In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import pickle

import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
plt.style.use('fivethirtyeight')
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the labelled name pairs; later cells read the name_1, name_2 and
# is_duplicate columns from this frame.
df = pd.read_csv("train.csv")

In [8]:
# Keep only the two raw name columns as the inputs of the split below.
X = df[["name_1", "name_2"]]

In [11]:
# Labels as a plain array, row-aligned with X.
y = df.is_duplicate.values

In [12]:
# Hold out 33% of the raw name pairs; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=69)

In [18]:
# Attach the labels so the split can be saved as one table. `assign` returns a
# new frame instead of writing into the slice produced by train_test_split,
# which avoids pandas' SettingWithCopyWarning.
X_train = X_train.assign(is_duplicate=y_train)

In [19]:
# Attach the labels so the split can be saved as one table. `assign` returns a
# new frame instead of writing into the slice produced by train_test_split,
# which avoids pandas' SettingWithCopyWarning.
X_test = X_test.assign(is_duplicate=y_test)

In [25]:
# NOTE(review): "train.csv" is also the path the loading cell reads from, so this
# overwrites the full dataset with only the training split. The frame's index is
# written as an extra column as well (to_csv default). Consider a separate
# output path so the notebook can be re-run from the original data.
X_train.to_csv("train.csv")

In [26]:
# Persist the held-out split (the two name columns plus is_duplicate).
X_test.to_csv("test.csv")

In [None]:
# NOTE(review): `features` and `targets` are only defined further down the
# notebook, so this cell fails on a top-to-bottom run; the identical split is
# repeated right after the feature-construction cell. Candidate for deletion.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.33, random_state=69)

In [2]:


# Helper kept for any cell that still references it; nothing below uses it.
lowercase = lambda string: string.lower()

# Lower-case both name columns so the character vocabulary is case-insensitive.
name_1 = [i.lower() for i in df.name_1.values]
name_2 = [i.lower() for i in df.name_2.values]

united_str = " ".join(name_1 + name_2)

# Every distinct character that occurs in the joined text (the joining space
# included whenever there is more than one name).
unique_symbols = set(united_str)

# Enumerate the characters in sorted order. Iterating the set directly gives an
# order that can change between kernel sessions (string hashing is randomized),
# which would make the indices, and any model trained on them, irreproducible.
char_mapper = {ch: i for i, ch in enumerate(sorted(unique_symbols))}
# The inverse mapping is derived from char_mapper so the two always agree.
char_unmapper = {i: ch for ch, i in char_mapper.items()}

# Longest name in the data; informational only, the encoding width below is fixed.
max_len = max(map(len, name_1 + name_2))

# Each name is encoded into a fixed 255-slot vector. Unused slots hold the
# padding index, which is one past the largest character index (the original
# note mentions 164 for the author's data).
name_1_blank = [max(char_mapper.values()) + 1] * 255
name_2_blank = [max(char_mapper.values()) + 1] * 255

def name_to_embedding(name: str) -> list:
    """Encode a name as a fixed-width list of 255 character indices.

    Each character is replaced by its index in the notebook-level
    ``char_mapper``. Slots past the end of the name hold the padding index
    ``max(char_mapper.values()) + 1``.

    Args:
        name: The (already lower-cased) name to encode.

    Returns:
        A list of exactly 255 integers.

    Notes:
        Names longer than 255 characters are truncated instead of raising
        IndexError. Characters missing from ``char_mapper`` are encoded with
        the padding index instead of raising KeyError.
    """
    pad_index = max(char_mapper.values()) + 1
    codes = [char_mapper.get(char, pad_index) for char in name[:255]]
    return codes + [pad_index] * (255 - len(codes))

# Encode every name of both columns into its fixed-width index list.
embeddings_list_1 = list(map(name_to_embedding, name_1))
embeddings_list_2 = list(map(name_to_embedding, name_2))

# One row per pair: the first name's codes followed by the second name's codes.
features = np.array(
    [left + right for left, right in zip(embeddings_list_1, embeddings_list_2)]
)

# Labels, row-aligned with `features`.
targets = df.is_duplicate.values

In [3]:
# Persist the character-to-index mapping so the inference cells can encode names
# exactly the way the training data was encoded.
with open("char_mapper.pkl", "wb") as f:
    pickle.dump(char_mapper, f)

In [4]:
# Hold out 33% of the encoded pairs; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.33, random_state=69)

### XGBOOST

In [5]:
# Fit a gradient-boosted regressor on the 0/1 duplicate labels; its continuous
# output is turned into a class by the threshold cells below.
# NOTE(review): the held-out split is part of eval_set, so it drives early
# stopping, and the same split is then used to tune the threshold and report
# metrics. Consider a separate validation split.
reg = xgb.XGBRegressor(n_estimators=1000)
_ = reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=50,
           verbose=False)
# Raw regression scores for the held-out rows.
prediction = reg.predict(X_test)



KeyboardInterrupt: 

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Sweep cut-offs 0.00, 0.01, ..., 0.99 over the regressor scores and keep the one
# with the highest F1. (`best_tresh` keeps its original spelling because later
# cells read it.)
best_tresh, best_f1 = 0, 0
for thresh in tqdm(range(0, 100)):
    thresh /= 100
    pred = [1 if x > thresh else 0 for x in prediction]
    f1 = f1_score(y_test, pred)
    if f1 > best_f1:
        # Record the score together with the threshold. Without updating
        # best_f1 every threshold with a non-zero F1 replaces the previous
        # one, and the sweep ends on the last such threshold, not the best.
        best_tresh, best_f1 = thresh, f1

In [None]:
# Show the cut-off selected by the sweep above.
print(best_tresh)

In [None]:
# Binarize the regressor scores with the selected cut-off.
pred = [1 if x > best_tresh else 0  for x in prediction]

In [None]:
# Per-class precision / recall / F1 of the thresholded predictions on the held-out rows.
print(classification_report(y_test, pred))

In [None]:
def name_to_embedding(name: str) -> list:
    """Encode a name as a fixed-width list of 255 character indices.

    Each character is replaced by its index in the notebook-level
    ``char_mapper``. Slots past the end of the name hold the padding index
    ``max(char_mapper.values()) + 1``.

    Args:
        name: The (already lower-cased) name to encode.

    Returns:
        A list of exactly 255 integers.

    Notes:
        Names longer than 255 characters are truncated instead of raising
        IndexError. Characters missing from ``char_mapper`` are encoded with
        the padding index instead of raising KeyError.
    """
    pad_index = max(char_mapper.values()) + 1
    codes = [char_mapper.get(char, pad_index) for char in name[:255]]
    return codes + [pad_index] * (255 - len(codes))

In [None]:
def compare_names(name_1, name_2, threshold=.99):
    """Classify a pair of names with the fitted XGBoost regressor.

    Both names are lower-cased, encoded with ``name_to_embedding`` and
    concatenated into a single feature row that is scored by the
    notebook-level ``reg``.

    Args:
        name_1: First name.
        name_2: Second name.
        threshold: Score strictly above which the pair counts as the same
            name. Defaults to 0.99, the cut-off previously hard-coded here.

    Returns:
        "Одинаковое название" when the score exceeds ``threshold``,
        otherwise "Другое название".
    """
    embed = np.array(
        name_to_embedding(name_1.lower()) + name_to_embedding(name_2.lower())
    ).reshape(1, -1)
    # A single row goes in, so a single score comes out.
    score = reg.predict(embed)[0]
    return "Одинаковое название" if score > threshold else "Другое название"

In [None]:
# Spot check: two names that share the "Pirelli" prefix.
# Note: this rebinds name_1 / name_2, which the feature-construction cell uses
# for the lists of lower-cased training names.
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "Pirelli Tyre Co., Ltd."
compare_names(name_1, name_2)

In [None]:
# Spot check: a real name against an unrelated string of random letters.
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "asjh ajhsf iajshf has"
compare_names(name_1, name_2)

### NN

In [None]:
# Training hyperparameters for the neural network below.
LEARNING_RATE = 1e-3  # step size passed to the Adam optimizer
BATCH_SIZE = 64       # rows per training mini-batch
EPOCHS = 20           # full passes over the training split

In [None]:
## train data
class TrainData(Dataset):
    """Map-style dataset that pairs each feature row with its label."""

    def __init__(self, X_data, y_data):
        # Both containers are indexed with the same position in __getitem__.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        sample = self.X_data[index]
        label = self.y_data[index]
        return sample, label


# Wrap the training split as float tensors: the network's linear layers and
# BCEWithLogitsLoss both operate on floating-point inputs and targets.
train_data = TrainData(torch.FloatTensor(X_train), 
                       torch.FloatTensor(y_train))
## test data    
class TestData(Dataset):
    """Map-style dataset that yields feature rows only (no labels)."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        """Number of samples in the wrapped container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.X_data[index]
        return row
    

# Held-out features as a float tensor; the labels stay in y_test for scoring.
test_data = TestData(torch.FloatTensor(X_test))

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# One row per batch, in dataset order. The evaluation cell relies on this when
# it squeezes each per-batch output down to a scalar.
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
class BinaryClassification(nn.Module):
    """Two-hidden-layer MLP that emits one raw logit per input row.

    Architecture: Linear(input_dim, 64) -> ReLU -> BatchNorm1d ->
    Linear(64, 64) -> ReLU -> BatchNorm1d -> Dropout(0.1) -> Linear(64, 1).
    No sigmoid is applied in ``forward``. Pair the output with
    ``nn.BCEWithLogitsLoss`` for training and apply ``torch.sigmoid`` to
    obtain probabilities.

    Args:
        input_dim: Number of input features. When omitted it is taken from
            the notebook-level ``features`` matrix (``features.shape[1]``),
            which is what the class always used before this parameter
            existed.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the first layer from the
            # feature matrix built earlier in the notebook.
            input_dim = features.shape[1]
        self.layer_1 = nn.Linear(input_dim, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        """Map a (batch, input_dim) float tensor to (batch, 1) logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Instantiate the network and move its parameters to the selected device.
model = BinaryClassification()
model.to(device)
print(model)
# BCEWithLogitsLoss applies the sigmoid itself, which is why the network's
# forward pass returns raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def binary_acc(y_pred, y_test):
    """Accuracy of a batch of logits, as a whole-number percentage.

    Args:
        y_pred: Raw model outputs (logits).
        y_test: Ground-truth 0/1 labels with the same shape as ``y_pred``.

    Returns:
        A scalar tensor holding the rounded percentage of matching labels.
    """
    # Logits -> probabilities -> hard 0./1. labels.
    predicted_labels = torch.sigmoid(y_pred).round()

    n_correct = predicted_labels.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.shape[0]

    return torch.round(fraction_correct * 100)

In [None]:
# Training mode: dropout is active and batch norm uses per-batch statistics.
model.train()
for e in range(1, EPOCHS+1):
    # Running sums over the batches of this epoch; averaged in the print below.
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        # The network's last layer has one output unit, so the labels get a
        # trailing dimension to line up with its output.
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    # Report the mean per-batch loss and accuracy for the epoch.
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
from tqdm import tqdm

In [None]:
# Collect hard 0/1 predictions for the held-out rows.
y_pred_list = []
# eval() turns dropout off and makes batch norm use its running statistics.
model.eval()
with torch.no_grad():  # no gradients are needed for inference
    for X_batch in tqdm(test_loader):
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        # The model emits logits; sigmoid followed by round gives 0./1. labels.
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

# test_loader uses batch_size=1, so squeeze() reduces every per-batch array to a
# scalar and the result is a flat list of floats.
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
# Per-class precision / recall / F1 of the network on the held-out rows.
print(classification_report(y_test, y_pred_list))

In [None]:
# Confusion matrix of the network's predictions (true labels first, predictions second).
confusion_matrix(y_test, y_pred_list)

# Inference

In [None]:
# Reload the character-to-index mapping saved by the pickle.dump cell, so that
# inference encodes names with the same indices as training.
# pickle.load can execute arbitrary code; only load a file this notebook wrote.
with open("char_mapper.pkl", "rb") as f:
    char_mapper = pickle.load(f)

In [None]:
# Example pair for the inference cells below.
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "Pirelli Tyre Co., Ltd."

In [None]:
def name_to_embedding(name: str) -> list:
    """Encode a name as a fixed-width list of 255 character indices.

    Each character is replaced by its index in the notebook-level
    ``char_mapper``. Slots past the end of the name hold the padding index
    ``max(char_mapper.values()) + 1``.

    Args:
        name: The (already lower-cased) name to encode.

    Returns:
        A list of exactly 255 integers.

    Notes:
        Names longer than 255 characters are truncated instead of raising
        IndexError. Characters missing from ``char_mapper`` are encoded with
        the padding index instead of raising KeyError.
    """
    pad_index = max(char_mapper.values()) + 1
    codes = [char_mapper.get(char, pad_index) for char in name[:255]]
    return codes + [pad_index] * (255 - len(codes))

In [None]:
def compare_names(name_1, name_2):
    """Classify a pair of names with the trained network.

    Both names are lower-cased, encoded with ``name_to_embedding`` and
    concatenated into a single feature row that is fed to the notebook-level
    ``model``.

    Args:
        name_1: First name.
        name_2: Second name.

    Returns:
        1 when the rounded sigmoid output is 1, otherwise 0.

    Notes:
        The model is switched to eval mode as a side effect.
    """
    encoded = name_to_embedding(name_1.lower()) + name_to_embedding(name_2.lower())
    # Follow the model's own device instead of hard-coding "cuda", so the
    # function also works when the model lives on the CPU.
    model_device = next(model.parameters()).device
    input_ = torch.tensor(encoded, dtype=torch.float32, device=model_device).reshape(1, -1)
    # Eval mode is required here: BatchNorm1d cannot compute batch statistics
    # from a single row, and dropout must be off for a stable answer.
    model.eval()
    # The forward pass itself runs under no_grad so no autograd graph is built.
    with torch.no_grad():
        probability = torch.sigmoid(model(input_))
    return int(torch.round(probability)[0][0])

In [None]:
# Spot check: two names that share the "Pirelli" prefix.
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "Pirelli Tyre Co., Ltd."
compare_names(name_1, name_2)

In [None]:
# Spot check: the same name on both sides.
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "Pirelli Neumaticos S.A.I.C."
compare_names(name_1, name_2)

In [None]:
# Spot check: a real name against an unrelated string of random letters.
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "asjh ajhsf iajshf has"
compare_names(name_1, name_2)

In [None]:
# Spot check: a real name against random text containing upper-case letters
# and digits (compare_names lower-cases both inputs before encoding).
name_1 = "Pirelli Neumaticos S.A.I.C."
name_2 = "asjh ajAEWRfq3w 9jshf has"
compare_names(name_1, name_2)