## NLP - Phrase to Phrase Matching

[basemodel](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#)

[SOTA](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)

[NLP ZeroToHero](https://www.kaggle.com/code/pavansanagapati/knowledge-graph-nlp-tutorial-bert-spacy-nltk/notebook)

table = """
A: Human Necessities
B: Operations and Transport
C: Chemistry and Metallurgy
D: Textiles
E: Fixed Constructions
F: Mechanical Engineering
G: Physics
H: Electricity
Y: Emerging Cross-Sectional Technologies
"""

1.0 - Very close match
0.75 - Close synonym
0.5 - Synonyms
0.25 - Somewhat related
0.0 - Unrelated

# Configs

In [None]:
import os
import datasets, transformers
import warnings, logging, torch

# Set the WANDB_DISABLED switch before any trainer is created.
os.environ["WANDB_DISABLED"] = "true"

# Mute Python warnings and every log record at WARNING level or below.
logging.disable(logging.WARNING)
warnings.simplefilter('ignore')

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
class CFG:
    """Static settings shared by the data, model and training cells."""

    # --- data ---
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # --- model ---
    # Directory the pretrained weights/tokenizer are loaded from
    # ('./model/clf' is the alternative location kept in the notebook).
    model_path = '../input/patentsberta'
    # Short name used to build the trainer output directory.
    model_ckpt = 'patentsberta'
    # Number of score classes the classifier head predicts.
    num_labels = 5

    # --- optimisation ---
    learning_rate = 2e-5
    weight_decay = 0.01
    epochs = 4
    batch_size = 32

    # --- cross-validation ---
    n_folds = 4

# Utilities

In [None]:
import re

def preprocess(batch):
    """Remove the ignored punctuation characters from ``batch['text']``.

    Args:
        batch: Mapping with a string under the ``'text'`` key.

    Returns:
        The same ``batch`` object, with ``'text'`` replaced by the cleaned
        string (the mapping is modified in place).
    """
    # NOTE(review): as written, the '-' sits between two escaped backslashes
    # and is parsed as a range operator, so hyphens are NOT stripped -
    # confirm whether that is intended.
    chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\_]'
    cleaner = re.compile(chars_to_ignore_regex)
    cleaned_text = cleaner.sub('', batch['text'])
    batch['text'] = cleaned_text
    return batch

In [None]:
from datasets import Dataset, Features, Value, ClassLabel
import string

def get_dds(df,test=False):
    """Turn a competition dataframe into Hugging Face dataset(s).

    A ``text`` column is built as
    ``[<section>][SEP]<context>[SEP]<anchor>[SEP]<target>`` (lower-cased),
    where ``<section>`` is the first character of ``context``.

    Args:
        df: Dataframe with ``context``, ``anchor`` and ``target`` columns and,
            unless ``test`` is true, a ``score`` column. It is not modified.
        test: When true, return the tokenized inference dataset instead of
            labelled train/test/validation splits.

    Returns:
        With ``test=True``: a ``Dataset`` mapped through ``tokenize`` with the
        ``id``, ``anchor``, ``target``, ``context`` and ``text`` columns removed.
        Otherwise: a dataset dict with ``train``, ``test`` and ``validation``
        splits holding ``text``, ``sectok`` and ``labels`` (a ``ClassLabel``).
    """
    sep = "[SEP]"
    # Work on a copy: the helper columns and the score remapping (which cannot
    # be applied twice) must not leak into the caller's dataframe.
    df = df.copy()
    df['section'] = df.context.str[0]
    df['sectok'] = '[' + df.section + ']'
    df['text'] = df.sectok + sep + df.context + sep + df.anchor + sep + df.target
    # NOTE(review): lower-casing happens after the markers are concatenated,
    # so "[SEP]" and the section token are lower-cased inside `text` while the
    # `sectok` column keeps its original case - confirm this matches the
    # tokens registered with the tokenizer.
    df['text'] = df.text.str.lower()

    if test:
        dataset = Dataset.from_pandas(df)
        return dataset.map(
            tokenize,
            remove_columns=['id', 'anchor', 'target', 'context', 'text'],
            batched=True,
            batch_size=None,
        )

    # Scores become class ids ordered from closest match (0) to unrelated (4).
    score_to_class = {1.00: 0, 0.75: 1, 0.50: 2, 0.25: 3, 0.00: 4}
    df['labels'] = df['score'].map(score_to_class)
    # Selecting the three needed columns makes a separate drop() unnecessary.
    df_text_genre = df[['text', 'labels', 'sectok']]
    class_names = ['Very.close.match', 'Close.synonym', 'Synonyms', 'Somewhat.related', 'Unrelated']
    score_features = Features({
        'text': Value('string'),
        'sectok': Value('string'),
        'labels': ClassLabel(names=class_names),
    })
    # Hold out 20% as "test", then carve 30% of the remainder off as "validation".
    dataset = Dataset.from_pandas(df_text_genre, features=score_features).train_test_split(test_size=0.2)
    dataset['train'], dataset['validation'] = dataset['train'].train_test_split(test_size=0.3).values()
    return dataset

In [None]:
from datasets import DatasetDict 
def get_cv_dds(ds,folds,fold_num=0):
    """Build the dataset dict for one cross-validation fold.

    Args:
        ds: Dataset dict with ``"train"`` and ``"validation"`` splits.
        folds: Sequence of ``(train_indices, validation_indices)`` pairs that
            index into ``ds["train"]``.
        fold_num: Position in ``folds`` of the fold to materialise.

    Returns:
        A ``DatasetDict`` whose ``train``/``test`` entries are the fold's two
        index selections and whose ``validation`` entry is ``ds["validation"]``.
    """
    trn, val = folds[fold_num]
    # Select from the dataset passed in, not from the notebook-level pandas
    # frame `df` (which has no .select()).
    source_split = ds["train"]
    return DatasetDict({
        "train": source_split.select(trn),
        "test": source_split.select(val),
        "validation": ds["validation"],
    })

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw the confusion matrix of predictions, normalized over true labels.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Display names for the matrix axes.
    """
    normalized = confusion_matrix(y_true, y_preds, normalize="true")
    figure, axes = plt.subplots(figsize=(6,6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized,
        display_labels=labels,
    ).plot(cmap="Blues", values_format=".2f", ax=axes, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(pred):
    """Return the Pearson correlation between predicted and true class ids.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced with
            ``argmax`` over the last axis) and ``label_ids``.

    Returns:
        ``{"pearson": r}`` where ``r`` is the off-diagonal entry of
        ``np.corrcoef(predicted_ids, label_ids)``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    correlation = np.corrcoef(predicted_ids, pred.label_ids)
    return {"pearson": correlation[0][1]}

In [None]:
def tokenize(batch):
    """Tokenize ``batch["text"]`` with the notebook-level ``tokenizer``.

    Padding and truncation are both enabled.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

def get_model():
    """Load the sequence classifier from ``CFG.model_path`` onto ``device``.

    The classification head is sized with ``CFG.num_labels``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_path, num_labels=CFG.num_labels
    )
    return classifier.to(device)

def get_trainer(dds, model=None):
    """Build a ``Trainer`` for the given encoded splits.

    Args:
        dds: Mapping with at least ``"train"`` and ``"validation"`` datasets.
        model: Model to train; a fresh one from ``get_model()`` when ``None``.

    Returns:
        A ``Trainer`` that trains on ``dds["train"]``, evaluates on
        ``dds["validation"]`` and reports ``compute_metrics``. Hyper-parameters
        come from ``CFG``; the output directory is
        ``"<CFG.model_ckpt>-finetuned-patent"``.
    """
    if model is None:
        model = get_model()
    batch_size = CFG.batch_size
    # One log entry per len(train) // batch_size steps; clamp to 1 so a train
    # split smaller than a single batch does not produce logging_steps=0.
    logging_steps = max(1, len(dds["train"]) // batch_size)
    model_name = f"{CFG.model_ckpt}-finetuned-patent"
    training_args = TrainingArguments(output_dir=model_name,
                                      num_train_epochs=CFG.epochs,
                                      learning_rate=CFG.learning_rate,
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      weight_decay=CFG.weight_decay,
                                      evaluation_strategy="epoch",
                                      disable_tqdm=False,
                                      logging_steps=logging_steps,
                                      push_to_hub=False,
                                      log_level="error"
                                     )

    trainer = Trainer(model=model, args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=dds["train"],
                      eval_dataset=dds["validation"]
                      )
    return trainer

In [None]:
def to_predictions(y_preds):
    """Convert predicted class ids into similarity scores.

    Class ids 0, 1, 2, 3, 4 map to 1.0, 0.75, 0.5, 0.25, 0.0 respectively.
    Values are binned with ``np.digitize`` against the class ids, so anything
    at or above 4 maps to 0.0.

    Args:
        y_preds: Array-like of class ids; may be empty.

    Returns:
        A float array of scores with the same shape as ``y_preds``.

    Raises:
        ValueError: If any value is below 0 (it has no corresponding score).
    """
    label = [0, 1, 2, 3, 4]
    score = np.array([1.00, 0.75, 0.50, 0.25, 0.0])
    # digitize returns the 1-based bin; shift to a 0-based index into `score`.
    positions = np.digitize(y_preds, label) - 1
    if np.any(positions < 0):
        raise ValueError("y_preds contains values below 0, which have no score")
    # Plain table lookup also works for empty input, unlike np.vectorize.
    return score[positions]

In [None]:
def label_int2str(row):
    """Translate a label id into its name via the train split's ``labels`` feature.

    Reads the notebook-level ``dataset``.
    """
    label_feature = dataset["train"].features["labels"]
    return label_feature.int2str(row)

In [None]:
from imblearn.combine import SMOTEENN

def balanceSOMTEENN(df):
    """Resample ``df`` with SMOTEENN and return a new ``text``/``labels`` frame.

    Args:
        df: Dataframe with ``text`` and ``labels`` columns.

    Returns:
        A new dataframe with columns ``text`` and ``labels`` built from the
        resampled arrays.
    """
    # NOTE(review): SMOTE-style samplers presumably need numeric features;
    # confirm that raw strings in `text` are accepted by fit_resample.
    # A Series has no reshape(); go through to_numpy() to get the (n, 1) matrix.
    X = df["text"].to_numpy().reshape(-1, 1)
    y = df["labels"].to_numpy().reshape(-1, 1)

    smote_enn = SMOTEENN(random_state=0)

    X_resampled, y_resampled = smote_enn.fit_resample(X, y)

    # Make the labels a column again so they can sit next to the text column.
    y_resampled = y_resampled.reshape(-1, 1)

    tmp = np.concatenate((X_resampled, y_resampled), axis=1)

    df = pd.DataFrame(tmp, columns=['text', 'labels'])

    return df

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

def balanceLabels(df):
    """Randomly oversample ``df`` and return a new ``text``/``labels`` frame.

    Args:
        df: Dataframe with ``text`` and ``labels`` columns.

    Returns:
        A new dataframe with columns ``text`` and ``labels`` built from the
        arrays returned by ``RandomOverSampler(random_state=0)``.
    """
    oversampler = RandomOverSampler(random_state=0)

    # Both inputs are passed as single-column matrices.
    features = df["text"].to_numpy().reshape(-1, 1)
    targets = df["labels"].to_numpy().reshape(-1, 1)
    features_bal, targets_bal = oversampler.fit_resample(features, targets)

    # Put text and labels side by side as two columns of one array.
    stacked = np.hstack((features_bal, targets_bal.reshape(-1, 1)))
    return pd.DataFrame(stacked, columns=['text', 'labels'])

# Exploratory Data Analysis

In [None]:
import pandas as pd

# Load the competition train/test tables from the configured input folder.
df, test_data = (
    pd.read_csv(CFG.input_path + file_name)
    for file_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first rows of the training table.
df.head()

In [None]:
# Build the labelled train/test/validation splits from the training table.
dataset = get_dds(df)
dataset

In [None]:
# NOTE(review): the value returned by map() is not assigned, so unless map()
# modifies `dataset` in place the preprocessing result is discarded (only
# displayed) - confirm intent. The test-set path never calls preprocess.
dataset.map(preprocess)

In [None]:
# Distinct section tokens in the training split; registered with the tokenizer later.
sectok = np.unique(dataset["train"]["sectok"])

In [None]:
# Switch the splits to pandas output for the plotting / balancing cells below.
dataset.set_format(type="pandas")

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of label counts in the training split (before balancing).
dataset["train"]["labels"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Oversample each split with balanceLabels().
# NOTE(review): the test and validation splits are oversampled too, so metrics
# computed on them are measured on a resampled label distribution - confirm intent.
df_train = balanceLabels(dataset["train"][:])
df_test = balanceLabels(dataset["test"][:])
df_validation = balanceLabels(dataset["validation"][:])

In [None]:
# Same bar chart as before, now for the oversampled training frame.
df_train.labels.value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Undo the pandas output format set earlier.
dataset.reset_format()

In [None]:
# Wrap the oversampled frames back into a DatasetDict keyed by split name.
balance_dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ("train", df_train),
        ("test", df_test),
        ("validation", df_validation),
    )
})
balance_dataset

# Tokenize

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer stored alongside the pretrained model in CFG.model_path.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path) 

In [None]:
# Register the section tokens collected in `sectok` as additional special tokens.
tokenizer.add_special_tokens({'additional_special_tokens': list(sectok)})

In [None]:
# Tokenize all balanced splits with the batched tokenize() helper.
dataset_encoded = balance_dataset.map(tokenize,batched=True, batch_size=None)

In [None]:
# Display the tokenized splits.
dataset_encoded

# Get Trainer

In [None]:
# Load the classifier and wrap it in a Trainer over the tokenized splits.
model = get_model()
trainer = get_trainer(dataset_encoded,model)

In [None]:
# Resize the embedding matrix to the tokenizer's current size, which includes
# the section tokens added above; must run before evaluating or training.
model.resize_token_embeddings(len(tokenizer))

# Evaluate

In [None]:
# Evaluate on the validation split before fine-tuning, as a baseline.
trainer.evaluate()

# Train

In [None]:
# Fine-tune the model with the settings from CFG.
trainer.train()
# After imbalance library Pearson Score:750272 - only train data , 0.849238, 0.846498, 0.8794302410586742

In [None]:
# Collect every logged validation Pearson value and show the most recent one.
metrics = [o['eval_pearson'] for o in trainer.state.log_history if 'eval_pearson' in o]
metrics[-1]

In [None]:
# model.save_pretrained('./model/clf')
# .from_pretrained('./model/clf')

# Predict

In [None]:
# Display the tokenized splits again before predicting.
dataset_encoded

In [None]:
# Run the trained model on the (oversampled) held-out test split and show its metrics.
outputs = trainer.predict(dataset_encoded["test"])
outputs.metrics

In [None]:
# Predicted class id per example, the matching true ids, and the class display
# names (listed in class-id order, same names as used in get_dds).
y_preds = np.argmax(outputs.predictions, axis=1)
y_valid = dataset_encoded["test"]["labels"]
labels = ['Very.close.match', 'Close.synonym','Synonyms', 'Somewhat.related','Unrelated']
# dataset_encoded["test"].features["labels"].names

In [None]:
# Sanity check: number of predictions, number of true labels, number of classes.
len(y_preds), len(y_valid),len(labels)

In [None]:
# Visualise how the held-out predictions distribute over the true classes.
plot_confusion_matrix(y_preds, y_valid,labels)

In [None]:
# Build and tokenize the competition test set (test=True skips the label handling).
encoded_test = get_dds(test_data,test=True)
encoded_test

In [None]:
# Predict class ids for the competition test set and map them back to scores.
outputs = trainer.predict(encoded_test)
y_preds = np.argmax(outputs.predictions, axis=1)
predictions = to_predictions(y_preds)

In [None]:
# Peek at the first ten scores and the total number of predictions.
predictions[:10],len(predictions)

# Submission

In [None]:
import datasets

# Pair each test id with its predicted score and write the submission file.
submission = datasets.Dataset.from_dict(
    dict(id=test_data['id'], score=predictions)
)
submission.to_csv('submission.csv', index=False)