# Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import OrdinalEncoder
# import tensorflow as tf

# from tensorflow import keras
# from tensorflow.keras.layers import Dense, Flatten, Dropout
# from tensorflow.keras import utils
# from tensorflow.keras.preprocessing.text import Tokenizer

# Data Loading

## Score meanings
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

**1.0** - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).

**0.75** - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".

**0.5** - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.

**0.25** - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.

**0.0** - Unrelated.

## Files
**train.csv** - the training set, containing phrases, contexts, and their similarity scores

**test.csv** - the test set, identical in structure to the training set but without the score

**sample_submission.csv** - a sample submission file in the correct format

## Columns

**id** - a unique identifier for a pair of phrases

**anchor** - the first phrase

**target** - the second phrase

**context** - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored

**score** - the similarity. This is sourced from a combination of one or more manual expert ratings.

In [None]:
# Root folder of the competition data on Kaggle. All three files are read
# from this one absolute location so loading does not depend on the
# notebook's current working directory.
DATA_DIR = "/kaggle/input/us-patent-phrase-to-phrase-matching"

# Train and test are indexed by the pair "id"; the sample submission keeps
# "id" as an ordinary column.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv", index_col="id")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv", index_col="id")
sample_submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

y_col = "score"              # name of the target column
cols = test_df.columns       # columns of the test frame, reused as feature names
train_index = train_df.index
test_index = test_df.index
y = train_df.score           # target as a Series
y2D = train_df[["score"]]    # target as a one-column DataFrame

In [None]:
# Inspect the column dtypes of the sample submission frame.
sample_submission_df.dtypes

In [None]:
# Shapes of the three loaded frames.
print(f"Total Train samples: {train_df.shape}")
print(f"Total Test samples: {test_df.shape}")
print(f"Sample submission data shape: {sample_submission_df.shape}")

print()

# Number of distinct values in each text column of the training data.
for col_name in ("anchor", "target", "context"):
    print(f"{col_name} col len: {train_df[col_name].nunique()}")

# Explore Data

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Work on copies so the frames loaded from disk stay unmodified.
X, X_test = train_df.copy(), test_df.copy()

### Verifying columns on cardinality

In [None]:
def verify_col(X1, X2, col):
    """Report whether ``X2[col]`` holds values that never occur in ``X1[col]``.

    Args:
        X1: Reference frame whose ``col`` values count as "known".
        X2: Frame whose ``col`` values are checked against ``X1``.
        col: Name of the column to compare; must exist in both frames.

    Returns:
        True if at least one distinct value of ``X2[col]`` is absent from
        ``X1[col]``; False if every value is covered, including when
        ``X2[col]`` has no values at all.
    """
    # Build the lookup once: a set of the distinct reference values gives
    # constant-time membership tests instead of rescanning X1[col] for
    # every candidate value.
    known = set(X1[col].unique())
    return any(value not in known for value in X2[col].unique())

In [None]:
# For each text column, print whether the test data contains values that are
# missing from the training data.
for checked_col in ("anchor", "target", "context"):
    print(verify_col(X, X_test, checked_col))

# Data Preprocessing

### Encode y

In [None]:
# Fit an ordinal encoder on the one-column score frame and replace `y` with
# the encoded values; the fitted encoder stays available as `enc_y`.
enc_y = OrdinalEncoder()
y = enc_y.fit_transform(y2D)

### Encode DataFrame

In [None]:
# Ordinal-encode the feature columns. The encoder is fitted on the training
# features only, so categories that first appear in the test data are mapped
# to -1 instead of making `transform` raise on an unknown category.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
train_features = X.drop(y_col, axis=1)
enc.fit(train_features)

X = pd.DataFrame(enc.transform(train_features), columns=cols, index=train_index)
# Flatten the target so it is stored as an ordinary 1-D column.
X["score"] = np.ravel(y)

X_test = pd.DataFrame(enc.transform(X_test), columns=cols, index=test_index)

In [None]:
# Preview the first rows of the encoded training frame.
X.head()

In [None]:
# Preview the first rows of the encoded test frame.
X_test.head()

# Modeling

In [None]:
def encode(X_train, X_valid, X_test):
    """Ordinal-encode the feature columns of one train/validation/test split.

    The encoder is fitted on the feature columns of ``X_train`` only, so the
    result depends solely on the arguments. Categories that were not seen
    during fitting are encoded as -1 in every frame.

    Args:
        X_train: Training frame holding the feature columns plus the target
            column named by the module-level ``y_col``.
        X_valid: Validation frame with the same columns as ``X_train``.
        X_test: Test frame holding only the feature columns.

    Returns:
        Tuple ``(X_train, X_valid, X_test)`` of new frames with encoded
        features. Each keeps the index of its input; the train and
        validation frames also carry their original target column.
    """
    train_features = X_train.drop(y_col, axis=1)
    feature_names = train_features.columns

    # Unknown categories get -1 rather than raising, because validation and
    # test rows may contain values absent from the training fold.
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    enc.fit(train_features)

    def _encoded(features):
        # Encode one feature frame, keeping its index and the feature names.
        return pd.DataFrame(
            enc.transform(features),
            columns=feature_names,
            index=features.index,
        )

    encoded_train = _encoded(train_features)
    encoded_train[y_col] = X_train[y_col]

    encoded_valid = _encoded(X_valid.drop(y_col, axis=1))
    encoded_valid[y_col] = X_valid[y_col]

    encoded_test = _encoded(X_test)

    return encoded_train, encoded_valid, encoded_test

In [None]:
# 5-fold stratified cross-validation of a random-forest regressor on the
# encoded features, collecting per-fold validation and test predictions.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

scores = []      # validation RMSE of each fold
val_preds = []   # validation predictions of each fold
test_preds = []  # test predictions of each fold

# Split features and target once instead of popping the target from each
# fold slice.
features = X.drop(columns="score")
target = X["score"]

# The folds are stratified on the flattened `y`.
for i, (train, valid) in enumerate(kfold.split(X=X, y=np.ravel(y))):
    X_train, y_train = features.iloc[train], target.iloc[train]
    X_valid, y_valid = features.iloc[valid], target.iloc[valid]

    # Fixed seed so fold scores are reproducible, matching the seeded splitter.
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)

    val_pred = model.predict(X_valid)
    val_preds.append(val_pred)

    # RMSE computed explicitly: the `squared=False` flag is not available in
    # every scikit-learn release.
    loss = np.sqrt(mean_squared_error(y_valid, val_pred))
    scores.append(loss)

    test_pred = model.predict(X_test)
    test_preds.append(test_pred)

    print(f"{i}: {loss}")

print("----------CONCLUSION-------------")
print(np.mean(scores))

In [None]:
# Put each fold's test predictions in its own column, average across the
# folds row by row, then round to the nearest whole number.
preds = np.round(np.column_stack(test_preds).mean(axis=1))

In [None]:
# Display the encoded test frame.
X_test

In [None]:
# Fill the sample submission frame with the predicted scores and the test ids.
ss = sample_submission_df
# inverse_transform expects a 2-D input and returns a 2-D result; flatten it
# so `score` is a plain 1-D column. Bracket assignment is used because
# attribute assignment only updates a column that already exists.
ss["score"] = enc_y.inverse_transform(preds.reshape(-1, 1)).ravel()
ss["id"] = X_test.index
ss

In [None]:
# Check the dtype of the submission's score column.
ss.score.dtype

In [None]:
# Write the submission to disk without the frame's row index.
ss.to_csv("submission.csv", index=False)

In [None]:
# Read the written file back and inspect its column dtypes.
pd.read_csv("./submission.csv").dtypes