In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn 
import tensorflow as tf
import tensorflow_data_validation as tfdv
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
import gc
from tqdm import tqdm 

def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each value of the ``id`` column to the ``point_of_interest`` on the same row.

    If an id appears on several rows, the last row wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {place_id: poi for place_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Group the ``id`` column by ``point_of_interest`` and return ``{poi: set_of_ids}``."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(ids) for poi, ids in ids_by_poi}

def get_score(input_df: pd.DataFrame, id2poi: dict = None, poi2ids: dict = None):
    """Mean Jaccard score between predicted and true match sets.

    Parameters
    ----------
    input_df : DataFrame with an ``id`` column and a ``matches`` column; each
        ``matches`` value is a whitespace-separated string of predicted ids.
    id2poi : optional mapping id -> point of interest (see ``get_id2poi``).
    poi2ids : optional mapping point of interest -> set of ids (see ``get_poi2ids``).

    When a mapping is not passed, the module-level variable of the same name
    is used, which is what the original implementation always did.

    Returns
    -------
    The mean of ``|targets & preds| / |targets | preds|`` over all rows, or
    NaN when ``input_df`` has no rows.

    Raises
    ------
    NameError if a mapping is neither passed nor defined at module level.
    """
    module_scope = globals()
    if id2poi is None:
        if 'id2poi' not in module_scope:
            raise NameError("id2poi was not passed and is not defined; build it with get_id2poi(train_df)")
        id2poi = module_scope['id2poi']
    if poi2ids is None:
        if 'poi2ids' not in module_scope:
            raise NameError("poi2ids was not passed and is not defined; build it with get_poi2ids(train_df)")
        poi2ids = module_scope['poi2ids']

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    if not scores:
        # Avoid numpy's "mean of empty slice" warning; the result is still NaN.
        return float('nan')
    return np.array(scores).mean()

def reduce_memory(df, use_float16=False):
    """Downcast the numeric columns of ``df`` in place to the smallest safe dtype.

    Parameters
    ----------
    df : DataFrame whose columns are rewritten in place.
    use_float16 : allow float16 as a target for float columns. Off by default
        because float16 keeps only about three significant decimal digits,
        which is far too coarse for values such as latitude/longitude.

    Returns
    -------
    The same ``df`` object, for call chaining.

    Notes
    -----
    * Only plain numpy integer and float columns are touched. Bool, object,
      datetime and pandas extension dtypes are left unchanged (the previous
      implementation turned bool and unsigned columns into float16).
    * Bounds are inclusive, so a column whose maximum is exactly 127 still
      fits int8.
    * A column is never widened; if nothing smaller fits, it is left as is
      (this also covers all-NaN and infinite columns).
    """
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype):
            continue  # pandas extension dtype (nullable ints, strings, categoricals, ...)
        if col_type.kind in 'iu':
            targets, limits_of = int_targets, np.iinfo
        elif col_type.kind == 'f':
            targets, limits_of = float_targets, np.finfo
        else:
            continue

        cmin = df[col].min()
        cmax = df[col].max()
        for target in targets:
            limits = limits_of(target)
            # NaN bounds compare False everywhere, so such columns fall through unchanged.
            if limits.min <= cmin and cmax <= limits.max:
                if np.dtype(target).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(target)
                break
    return df

# EDA

In [None]:
print('TF version:', tf.__version__)

In [None]:
ss = pd.read_csv('sample_submission.csv')
ss.head()

In [None]:
serving = pd.read_csv('test.csv')
serving.head()

In [None]:
serving.shape

In [None]:
# Load the pairs table and pass it through reduce_memory to shrink its numeric columns.
pairs = pd.read_csv('pairs.csv')
pairs = reduce_memory(pairs)
print(pairs.shape)
print(pairs.columns)
pairs.head()

In [None]:
pair_stats = tfdv.generate_statistics_from_dataframe(pairs)
tfdv.visualize_statistics(pair_stats)

In [None]:
# Load the training table and pass it through reduce_memory to shrink its numeric columns.
df = pd.read_csv('train.csv')
df = reduce_memory(df)
print(df.shape)
df.head()

In [None]:
train_stats = tfdv.generate_statistics_from_dataframe(df)
tfdv.visualize_statistics(train_stats)

In [None]:
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

In [None]:
from matplotlib_venn import venn3
from matplotlib_venn import venn2
# Three-way overlap of the id sets pairs.id_1, train.id and pairs.id_2 (all compared as strings).
venn3([set(pairs['id_1'].values.astype('str')),set(df['id'].values.astype('str')),set(pairs['id_2'].values.astype('str'))],('pairs1','train','pairs2'))
plt.show()

In [None]:
venn2([set(df['id'].values.astype('str')),set(pairs['id_2'].values.astype('str'))],('train','pairs2'))
plt.show()

In [None]:
venn2([set(df['id'].values.astype('str')),set(pairs['id_1'].values.astype('str'))],('train','pairs'))
plt.show()

In [None]:
# Training ids that appear on neither side of any row in `pairs`.
set_of_no_match_ids = (
    set(df['id'].values.astype('str'))
    - set(pairs['id_1'].values.astype('str'))
    - set(pairs['id_2'].values.astype('str'))
)
len(set_of_no_match_ids)

# Data preparation and modeling

In [None]:
# Self-join the training table on point_of_interest: each pair of rows sharing a POI
# becomes one row whose columns carry the `_1` / `_2` suffixes.
match_df = pd.merge(df, df, on="point_of_interest", suffixes=('_1', '_2'))
# Drop the rows where a record was joined with itself.
match_df = match_df[match_df["id_1"]!=match_df["id_2"]]
match_df = match_df.drop(["point_of_interest"], axis=1)
# Every remaining row is a positive example.
match_df["match"] = True
print(match_df.shape)
# Missing values in every column are replaced by the string "NA".
match_df.fillna("NA",inplace = True)
match_df.head()

In [None]:
from matplotlib_venn import venn2
venn2([set(match_df['id_1'].values.astype('str')),set(pairs['id_1'].values.astype('str'))],('train','pairs'))
plt.show()

In [None]:
grouped_df = match_df.groupby('id_1')

In [None]:
dict_df = dict(list(grouped_df))

In [None]:
# Distinct anchor ids: as a set for set arithmetic, and as a list for random.sample.
set_keys = set(match_df['id_1'].values)
keys = list(set_keys)

In [None]:
dict_non_match = {}
def non_match(keys, n_samples = 40):
    """Sample ids that are NOT matches of one anchor and store them in ``dict_non_match``.

    Parameters
    ----------
    keys : a single anchor id (a key of ``dict_df``); the plural name is kept
        for compatibility with existing callers.
    n_samples : how many non-matching ids to draw (default 40).

    Side effects
    ------------
    Sets ``dict_non_match[keys]`` to a list of ``n_samples`` ids drawn without
    replacement from ``set_keys``, excluding both the anchor's known matches
    (``id_2`` values of its group) and the anchor itself. The previous
    version did not exclude the anchor, so a record could be paired with
    itself as a "non-match".

    Raises
    ------
    ValueError (from ``random.sample``) if fewer than ``n_samples`` candidates remain.
    """
    excluded = set(dict_df[keys]['id_2'].values)
    excluded.add(keys)
    # Set difference already builds a new set, so set_keys needs no defensive copy.
    candidates = list(set_keys - excluded)
    dict_non_match[keys] = random.sample(candidates, n_samples)

In [None]:
len(keys)

In [None]:
import random
# Seed the generator and sample from a sorted list: `keys` comes from a set of
# strings, whose iteration order changes between kernel sessions, so an unseeded
# sample over it is not reproducible.
random.seed(42)
random_keys = random.sample(sorted(keys), min(10000, len(keys)))
random_keys[1]

In [None]:
temp = list(map(non_match, random_keys)) # random_keys

In [None]:
ind = list(df['id'].values)
# Position of the first occurrence of every id, built once so that lookups
# against the default list are O(1) instead of a linear list scan.
_default_ind = ind
_first_pos_by_id = {}
for _pos, _row_id in enumerate(ind):
    _first_pos_by_id.setdefault(_row_id, _pos)

def id_to_idx(id, ind = ind):
    """Return the position of ``id`` in ``ind`` (first occurrence).

    Parameters
    ----------
    id : the id to look up.
    ind : list of ids to search; defaults to the ids of ``df`` captured when
        this cell ran.

    Raises
    ------
    ValueError if ``id`` is not present, as ``list.index`` does.

    Notes
    -----
    The fast path is used only for the default list and assumes that list is
    not mutated after this cell runs. Any other list falls back to ``list.index``.
    """
    if ind is _default_ind:
        try:
            return _first_pos_by_id[id]
        except KeyError:
            raise ValueError(f"{id!r} is not in list") from None
    return ind.index(id)
     

In [None]:
from multiprocessing.dummy import Pool as ThreadPool 
def create_non_match_tuples(i, keys, values):
    """Pair the row of one anchor id with the rows of 40 sampled non-matching ids.

    Parameters
    ----------
    i : task number, returned unchanged so results can be identified.
    keys : the anchor id (looked up in ``df`` through ``id_to_idx``).
    values : candidate non-matching ids; 40 of them are drawn with ``random.sample``.

    Returns
    -------
    ``(i, pairs)`` where ``pairs`` is a list of ``(anchor_row, other_row)`` tuples.
    """
    sampled_ids = random.sample(values, 40)
    anchor_row = df.iloc[id_to_idx(keys)]
    pairs_for_anchor = [
        (anchor_row, df.iloc[id_to_idx(other_id)])
        for other_id in tqdm(sampled_ids)
    ]
    return i, pairs_for_anchor
# Build the negative pairs on 8 worker threads. starmap blocks until every task
# has finished; the try/finally shuts the pool down even if a task raises.
# (A stray bare `%timeit` line, which timed nothing, was removed here.)
pool = ThreadPool(8)
try:
    results = pool.starmap(
        create_non_match_tuples,
        zip(range(len(dict_non_match)), dict_non_match.keys(), dict_non_match.values()),
    )
finally:
    pool.close()
    pool.join()

In [None]:
list_non_match = [i[1] for i in results]

In [None]:
nm_df = pairs[pairs['match'] == 0]

In [None]:
nm_df.shape

In [None]:
# Column names for the flattened pairs: the field names of one row with a `_1`
# suffix, followed by the same names with a `_2` suffix.
idx_temp = list(list_non_match[0][0][0].index)
idx_1 = [field + '_1' for field in idx_temp]
idx_2 = [field + '_2' for field in idx_temp]
idx = [*idx_1, *idx_2]
print(idx)

In [None]:
# Flatten every (anchor_row, other_row) tuple into one list: the anchor's values
# followed by the other row's values.
temp = []
for y in list_non_match:
    for x in y:
        temp.append(list(x[0])+list(x[1]))
    
    
# NOTE(review): np.stack over lists mixing strings and numbers presumably coerces
# everything to one common dtype (likely strings) - confirm the coordinate columns
# are still numeric where that matters downstream.
arr  = np.stack(temp)

# Columns are named by `idx` (`_1` names first, then `_2` names).
nm2 = pd.DataFrame(data = arr, columns = idx)

nm2.drop(['point_of_interest_2', 'point_of_interest_1'],inplace= True, axis =1)
# Every generated pair is labelled as a non-match.
nm2['match'] = [0 for i in range(nm2.shape[0])]
print(nm2.shape)
nm2.head()

In [None]:
nm_df = pd.concat([nm_df,nm2])

In [None]:
nm_df.fillna("NA", inplace = True)
print(nm_df.isnull().any().sum())
print(nm_df.shape)
nm_df.head()

In [None]:
# Append up to 600000 positive pairs to the negatives collected so far.
nm_df = pd.concat([nm_df,match_df.iloc[:600000]])
# The label column now mixes numeric zeros (negatives) with booleans (positives).
# Written as-is, the CSV would hold strings such as "0.0" and "True", which do not
# read back as a numeric column, so later `match == 0` / `match == 1` filters would
# select nothing. Store plain 0/1 integers instead.
nm_df['match'] = nm_df['match'].astype(int)
nm_df.to_csv('full_train.csv',index  = False)

In [None]:
nm_df = pd.read_csv('full_train.csv')


In [None]:
nm_df.fillna("NAN",inplace  = True)
nm_df.isna().sum()

In [None]:
# Features to use 
X = nm_df[['latitude_1','longitude_1','latitude_2','longitude_2','categories_1','categories_2',]]
y = nm_df['match']

In [None]:
# del X
# del y
# gc.collect()

# Free a previous train/test split if one exists. A plain `del` raised NameError
# on a fresh kernel, because the split is only created in the next cell.
for _name in ('X_train', 'X_test', 'y_train', 'y_test'):
    globals().pop(_name, None)
gc.collect()

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [None]:
import seaborn as sns
# Distribution of the number of whitespace-separated tokens in categories_1.
# sns.distplot is deprecated; histplot with a density scale and a KDE overlay
# is its documented replacement.
sns.histplot([len(i.split()) for i in X['categories_1']], kde=True, stat='density')

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
num_words = 100000
oov_token = '<UNK>'
pad_type = 'pre'
trunc_type = 'pre'
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
# Fit on both category columns stacked one after the other. Adding the two
# Series with `+` concatenates the strings row by row without a separator, so the
# last word of categories_1 and the first word of categories_2 became one bogus token.
tokenizer.fit_on_texts(pd.concat([X_train['categories_1'], X_train['categories_2']]))

# Get our training data word index
word_index = tokenizer.word_index

# # Encode training data sentences into sequences
# train_sequences_1 = tokenizer.texts_to_sequences(X_train['categories_1'])
# train_sequences_2 = tokenizer.texts_to_sequences(X_train['categories_2'])

# # Pad the training sequences
# train_padded_1 = pad_sequences(train_sequences_1, padding=pad_type, truncating=trunc_type, maxlen=12)
# train_padded_2 = pad_sequences(train_sequences_1, padding=pad_type, truncating=trunc_type, maxlen=12)


# print("Word index:\n", word_index)
# print("\nTraining sequences:\n", train_sequences_1)
# print("\nPadded training sequences:\n", train_padded_1)
# print("\nPadded training shape:", train_padded_1.shape)
# print("Training sequences data type:", type(train_sequences_1))
# print("Padded Training sequences data type:", type(train_padded_1))

In [None]:
# Save tokenizer as a pickle

import pickle

# The tokenizer is wrapped in a dict under the key 'foo'; anything that loads
# this file must read it back through the same key.
dict1 = {'foo': tokenizer}

# Store data (serialize)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(dict1, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load data (deserialize)
# NOTE: unpickling executes code from the file; only load pickles you created yourself.
with open('tokenizer.pickle', 'rb') as handle:
    fq = pickle.load(handle)

# Display the reloaded word index as a round-trip check.
fq['foo'].word_index

In [None]:
def pre_process(X):
    """Turn rows of (category text, latitude, longitude) into model input rows.

    ``X`` is a 2-D array whose columns 0, 1, 2 are category, latitude and
    longitude. The category text is tokenized with the module-level
    ``tokenizer`` and padded/truncated to 12 ids (using ``pad_type`` and
    ``trunc_type``); latitude and longitude are appended as two extra
    columns, giving 14 columns per row.
    """
    category = X[:, 0]
    lat_col = np.reshape(X[:, 1], (-1, 1))
    lon_col = np.reshape(X[:, 2], (-1, 1))
    token_ids = tokenizer.texts_to_sequences(category)
    padded = pad_sequences(token_ids, padding=pad_type, truncating=trunc_type, maxlen=12)
    return np.concatenate([padded, lat_col, lon_col], axis = 1)
#pre_processed = pre_process(nm_df[nm_df['match']==1][['categories_1','latitude_1','longitude_1']].values)

In [None]:
def preprocess_doubelets(anchor, validation):
    """
    Run `pre_process` on each side of a pair and return the two results
    as float32 arrays, in the order (anchor, validation).
    """

    return (
        pre_process(anchor).astype('float32'),
        pre_process(validation).astype('float32')
      
    )

In [None]:
value_1 = nm_df[['categories_1','latitude_1','longitude_1']].values
value_2 = nm_df[['categories_2','latitude_2','longitude_2']].values

In [None]:
from itertools import starmap
# One-element list holding the (anchor, validation) float32 arrays for the whole table;
# later cells read them as pre[0][0] and pre[0][1].
pre = [preprocess_doubelets(value_1, value_2)]

In [None]:
# pos  = tf.data.Dataset.from_tensor_slices(pre[0][0])
# neg  = tf.data.Dataset.from_tensor_slices(pre[0][1])
# y_true = tf.data.Dataset.from_tensor_slices(nm_df['match'].values)

In [None]:
# dataset = tf.data.Dataset.zip((pos, neg,)
# dataset = dataset.shuffle(buffer_size=1024)
# dataset = dataset.map(preprocess_doubelets)

# # Let's now split our dataset in train and validation.
# train_dataset = dataset.take(round(image_count * 0.8))
# val_dataset = dataset.skip(round(image_count * 0.8))

# train_dataset = train_dataset.batch(32, drop_remainder=False)
# train_dataset = train_dataset.prefetch(8)

# val_dataset = val_dataset.batch(32, drop_remainder=False)
# val_dataset = val_dataset.prefetch(8)

In [None]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout,Embedding,Input,Concatenate,Reshape, Multiply, Subtract, Add, Multiply, Dropout, Subtract, Add,Lambda
from tensorflow.keras import Model
from keras.regularizers import l2
from keras.models import Sequential
from keras.optimizers import Adam



In [None]:
from keras import backend as K

def cosine_distance(vests):
    """Negated mean of the element-wise product of two L2-normalised tensors.

    ``vests`` is a pair ``(x, y)``. Both are normalised along the last axis,
    multiplied element-wise, averaged over the last axis (which is kept with
    size 1) and negated. Because a mean is taken rather than a sum, the value
    is the cosine similarity divided by the vector width, with the sign flipped.
    """
    left, right = vests
    left_unit = K.l2_normalize(left, axis=-1)
    right_unit = K.l2_normalize(right, axis=-1)
    elementwise = left_unit * right_unit
    return -K.mean(elementwise, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    """Output shape for ``cosine_distance``: the first input's leading dimension and a width of 1."""
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)


In [None]:
from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    """Keras metric: ROC AUC computed by scikit-learn's ``roc_auc_score``.

    The scikit-learn function is wrapped with ``tf.numpy_function`` and the
    result is declared as a ``tf.double`` tensor.
    """
    auc_tensor = tf.numpy_function(roc_auc_score, (y_true, y_pred), tf.double, stateful=False)
    return auc_tensor

In [None]:
# Shared embedding tower: 14 input positions per row. pre_process builds those as
# 12 padded category token ids followed by latitude and longitude.
input_1 = Input(shape = (14,))
# input_12 = Input(shape = (2,))
# input_2 = Input(shape = (12,))
# input_22 = Input(shape = (2,))

# NOTE(review): Keras Tokenizer indices start at 1, so the largest id is
# len(word_index); input_dim = len(word_index) presumably leaves it out of range -
# confirm. It is left unchanged here because build_model() further down must create
# identical weight shapes for load_weights('cosine.h5') to work.
# NOTE(review): all 14 positions go through the Embedding lookup, including the
# latitude/longitude columns - confirm this is intended.
x = Embedding( len(word_index) ,100, input_length = 14)(input_1)
# Flatten the 14 x 100 embedding output into one 1400-wide vector.
x = Reshape((1400,))(x)
x = Dense(128,activation = 'relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(256,activation = 'relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(512,activation = 'relu')(x)


# The tower maps one 14-wide input row to a 512-wide vector.
embeddings = Model(inputs = [input_1], outputs = x)
embeddings.summary()

In [None]:
# Siamese head: both inputs go through the same `embeddings` tower (shared weights).
input_2 = Input(shape = (14,))
input_3 = Input(shape = (14,))

embeddings_1 = embeddings(input_2)
embeddings_2 = embeddings(input_3)



# x3: element-wise squared difference of the two embeddings.
x3 = Subtract()([embeddings_1, embeddings_2])
x3 = Multiply()([x3, x3])

# x4: element-wise difference of the squared embeddings.
x1_ = Multiply()([embeddings_1, embeddings_1])
x2_ = Multiply()([embeddings_2, embeddings_2])
x4 = Subtract()([x1_, x2_])
    
    #https://stackoverflow.com/a/51003359/10650182
# x5: one value per pair produced by cosine_distance.
x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([embeddings_1, embeddings_2])
    
conc = Concatenate(axis=-1)([x5,x4, x3])

x = Dense(100, activation="relu", name='conc_layer')(conc)
x = Dropout(0.01)(x)
# Sigmoid output: the predicted probability that the pair is a match.
out = Dense(1, activation="sigmoid", name = 'out')(x)

model = Model([input_2, input_3], out)
model.summary()
model.compile(loss="binary_crossentropy", metrics=['acc',auroc], optimizer=Adam(0.00001))

In [None]:
tensorboard  =tf.keras.callbacks.TensorBoard(
    log_dir='logs'
)
early_stop  =tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=.001,
    patience=5,
    verbose=1,
    mode="auto",
    restore_best_weights=True,
)



In [None]:
 
# X_train / X_test are DataFrames, so they cannot be used to index the preprocessed
# numpy arrays; what identifies the split is their row labels. nm_df was read from
# CSV and therefore has a default 0..n-1 index, so those labels are also positions
# into pre[0][0] / pre[0][1].
train_rows = X_train.index.to_numpy()
val_rows = X_test.index.to_numpy()
# steps_per_epoch / validation_steps are left to Keras: with array inputs they follow
# from batch_size. They were previously derived from len(X_test), which cut every
# training epoch down to the size of the validation split.
model.fit(
    [pre[0][0][train_rows], pre[0][1][train_rows]], y_train.to_numpy(),
    epochs = 100, batch_size = 128,
    validation_data=([pre[0][0][val_rows], pre[0][1][val_rows]], y_test.to_numpy()),
    validation_batch_size=128, callbacks = [tensorboard, early_stop])

In [None]:
model.save('cosine.h5')

In [None]:
model.load_weights('cosine.h5')

In [None]:
# Reuse the shared tower object directly. Looking it up with get_layer('model')
# depends on an auto-generated layer name, which changes when cells are re-run
# (and between Keras versions).
embedding_model = Model(embeddings.inputs, embeddings.outputs)
embedding_model.summary()

In [None]:
extracted_embeddings = []

# Embed the anchor side in chunks of 2000 rows. The row count is taken from the data
# instead of a hard-coded 1780121, and the final partial chunk is included, so the
# stacked result has exactly one embedding per row of pre[0][0].
chunk_size = 2000
n_rows = len(pre[0][0])
for start in tqdm(range(0, n_rows, chunk_size)):
    chunk = pre[0][0][start:start + chunk_size]
    extracted_embeddings.append(embedding_model.predict(chunk, verbose=0))
    

In [None]:
extracted_embeddings  = np.vstack(extracted_embeddings)
extracted_embeddings.shape

In [None]:
a = nm_df[nm_df['match'] == 0].index[:10000]
b = nm_df[nm_df['match'] == 1].index[:10000]

In [None]:
from sklearn.manifold import TSNE
X = np.vstack([extracted_embeddings[a] ,extracted_embeddings[b]])
X_embedded = TSNE(n_components=2, learning_rate='auto',
                   init='random').fit_transform(X)
X_embedded.shape

In [None]:
# To plot the embedding
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X_embedded[:,0], X_embedded[:,1], c = np.vstack([nm_df['match'].values[a] ,nm_df['match'].values[b]]), s = 0.5)

In [None]:
serving.shape

# Predict only on nearest neighbours

In [None]:
def generate_test_data(df, rounds = 3, n_neighbors = 5, features = ['id', 'latitude', 'longitude','categories']):
    """Build candidate pairs by pairing every row with its nearest neighbours.

    Parameters
    ----------
    df : DataFrame containing at least the columns named in ``features``.
    rounds : how many neighbours to pair with each row; round ``j`` uses the
        ``j``-th nearest neighbour other than the row itself.
    n_neighbors : neighbours requested from the KNN query (includes the row itself).
    features : columns copied into the output; ``features[1:3]`` are the two
        coordinate columns the KNN model is fitted on.

    Returns
    -------
    ``(df, dataset)`` where ``dataset`` has the columns ``<feature>_1`` for the
    row itself followed by ``<feature>_2`` for the neighbour. It contains one
    row per (row, round), with exact duplicates removed, the index reset and
    float64 columns cast to float32.

    Notes
    -----
    * ``n_neighbors`` is capped at ``len(df)`` and ``rounds`` at
      ``n_neighbors - 1`` so small inputs no longer fail; with fewer than two
      rows an empty ``dataset`` is returned.
    * Rows are addressed by position, so ``df`` need not have a default index.
    """
    features = list(features)
    print(df.shape)
    out_columns = [name + '_1' for name in features] + [name + '_2' for name in features]

    n_rows = len(df)
    n_neighbors = min(n_neighbors, n_rows)
    rounds = min(rounds, n_neighbors - 1)
    if rounds < 1:
        return df, pd.DataFrame(columns = out_columns)

    # Fit KNN on the raw coordinates and get, per row, the positions of its neighbours
    coords = df[features[1:3]]
    knn_model = NearestNeighbors(
        n_neighbors = n_neighbors,
        radius = 1.0,
        algorithm = 'kd_tree',
        leaf_size = 30,
        metric = 'minkowski',
        p = 2,
        n_jobs = -1
    )
    knn_model.fit(coords)
    indices = knn_model.kneighbors(coords, return_distance = False)

    # Move each row's own position (if present) to the end of its neighbour list while
    # keeping the others in distance order: a stable sort on the boolean "is self" flag.
    is_self = indices == np.arange(n_rows)[:, None]
    order = np.argsort(is_self, axis = 1, kind = 'stable')
    candidates = np.take_along_axis(indices, order, axis = 1)

    df_features = df[features].reset_index(drop = True)
    left = df_features.add_suffix('_1')
    round_frames = []
    for j in range(rounds):
        right = df_features.iloc[candidates[:, j]].reset_index(drop = True).add_suffix('_2')
        round_frames.append(pd.concat([left, right], axis = 1))

    # Concatenate rounds, remove duplicates and reset the index
    dataset = pd.concat(round_frames, axis = 0)
    dataset = dataset.drop_duplicates().reset_index(drop = True)
    float64_cols = dataset.select_dtypes(include = [np.float64]).columns
    dataset = dataset.astype({col: np.float32 for col in float64_cols})
    return df, dataset

In [None]:
df, dataset = generate_test_data(serving)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
import re
from itertools import starmap
serving = pd.read_csv('test.csv')
serving.head()


def build_model():
    """Rebuild the siamese network (uncompiled) with the same layers as the training cells.

    Reads the module-level ``word_index`` for the Embedding size and uses
    ``cosine_distance`` / ``cos_dist_output_shape`` in the Lambda layer.
    Returns a Model taking two 14-wide inputs and producing one sigmoid output.
    """
    input_1 = Input(shape = (14,))
# input_12 = Input(shape = (2,))
# input_2 = Input(shape = (12,))
# input_22 = Input(shape = (2,))

    # NOTE(review): layer sizes must stay identical to the training-time model,
    # otherwise load_weights('cosine.h5') will not match.
    x = Embedding( len(word_index) ,100, input_length = 14)(input_1)
    x = Reshape((1400,))(x)
    x = Dense(128,activation = 'relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(256,activation = 'relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(512,activation = 'relu')(x)


    # Shared tower applied to both sides of the pair.
    embeddings = Model(inputs = [input_1], outputs = x)
    input_2 = Input(shape = (14,))
    input_3 = Input(shape = (14,))

    embeddings_1 = embeddings(input_2)
    embeddings_2 = embeddings(input_3)



    # x3: element-wise squared difference of the two embeddings.
    x3 = Subtract()([embeddings_1, embeddings_2])
    x3 = Multiply()([x3, x3])

    # x4: element-wise difference of the squared embeddings.
    x1_ = Multiply()([embeddings_1, embeddings_1])
    x2_ = Multiply()([embeddings_2, embeddings_2])
    x4 = Subtract()([x1_, x2_])

        #https://stackoverflow.com/a/51003359/10650182
    x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([embeddings_1, embeddings_2])

    conc = Concatenate(axis=-1)([x5,x4, x3])

    x = Dense(100, activation="relu", name='conc_layer')(conc)
    x = Dropout(0.01)(x)
    out = Dense(1, activation="sigmoid", name = 'out')(x)

    model = Model([input_2, input_3], out)


    return model
model = build_model()
model.load_weights('cosine.h5')
def pre_process(X):
    """Turn rows of (latitude, longitude, category text) into model input rows.

    This definition replaces the earlier ``pre_process`` in the notebook and
    expects a different column order: columns 0, 1, 2 of ``X`` are latitude,
    longitude and category. The category text is tokenized with the
    module-level ``tokenizer`` and padded/truncated to 12 ids; latitude and
    longitude are appended as two extra columns, giving 14 columns per row.
    """
    lat_col = np.reshape(X[:, 0], (-1, 1))
    lon_col = np.reshape(X[:, 1], (-1, 1))
    category = X[:, 2]
    token_ids = tokenizer.texts_to_sequences(category)
    padded = pad_sequences(token_ids, padding=pad_type, truncating=trunc_type, maxlen=12)
    return np.concatenate([padded, lat_col, lon_col], axis = 1)
#pre_processed = pre_process(nm_df[nm_df['match']==1][['categories_1','latitude_1','longitude_1']].values)

def predict(df, threshold = 0.3):
    """Score every candidate pair in ``df`` and add a 0/1 ``match`` column.

    Parameters
    ----------
    df : candidate pairs as produced by ``generate_test_data``. Columns at
        positions 1-3 and 5-7 are read as (latitude, longitude, categories)
        of the first and second record, the order ``pre_process`` expects.
    threshold : a pair is labelled 1 when the model output is strictly
        greater than this value (default 0.3).

    Returns
    -------
    The same ``df`` object, with ``match`` added or overwritten in place.

    Notes
    -----
    All pairs are preprocessed and scored in one batched ``model.predict``
    call instead of one call per row. The second record is read from columns
    5-7 explicitly, so calling this again on a frame that already has a
    ``match`` column still works.
    """
    if len(df) == 0:
        df['match'] = []
        return df
    anchor, validate = preprocess_doubelets(
        df.iloc[:, 1:4].to_numpy(),
        df.iloc[:, 5:8].to_numpy(),
    )
    scores = np.asarray(model.predict([anchor, validate], verbose=0)).reshape(-1)
    df['match'] = (scores > threshold).astype(int)
    return df

In [None]:
df = predict(dataset)

In [None]:
# Keep id_2 only on rows predicted as a match; every other row (and any other
# missing value) gets the 'NA' placeholder. This is done in one vectorized pass:
# the per-group loop it replaces re-concatenated the whole frame for every id,
# assigned into group slices and printed one line per id.
temp2 = df.copy()
temp2['match_id'] = temp2['id_2'].where(temp2['match'] == 1)
temp2 = temp2.fillna('NA')
temp2.head(15)

In [None]:
eval_df = temp2.groupby('id_1')['match_id'].\
                        apply(list).reset_index()
eval_df



# Prediction on test data set 

In [None]:
def matches(id1,list1):
    """Build the space-separated match string for one id.

    Parameters
    ----------
    id1 : the anchor id; it always comes first in the result.
    list1 : iterable of matched ids, possibly containing the placeholder
        ``'NA'``, empty strings, duplicates or ``id1`` itself.

    Returns
    -------
    ``id1`` followed by each distinct real id from ``list1`` in first-seen
    order, separated by single spaces.

    Notes
    -----
    The previous version removed every space from the joined string, fusing
    multiple matches into one token, and deleted the substring ``'NA'`` from
    inside real ids. Filtering whole tokens avoids both problems.
    """
    seen = {id1, 'NA', ''}
    tokens = [id1]
    for candidate in list1:
        if candidate not in seen:
            seen.add(candidate)
            tokens.append(candidate)
    return ' '.join(tokens)
# One submission string per id: the id itself followed by its predicted matches.
# (A leftover debug call on row 4 was removed; it raised IndexError whenever fewer
# than five ids were present and its result was discarded.)
eval_df['match'] = [
    matches(anchor_id, match_ids)
    for anchor_id, match_ids in zip(eval_df['id_1'].values, eval_df['match_id'].values)
]
eval_df = eval_df.drop(labels='match_id', axis=1)
eval_df