In [None]:
import pandas as pd

from pandas_profiling import ProfileReport

In [None]:
# Test pairs; the ids are kept aside so they can be attached to the submission later.
test_df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')
test_id = test_df['id']

# Training pairs, with the `score` label column cast to float.
train_df = pd.read_csv(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv'
).astype({'score': float})

# EDA

### We can use the pandas-profiling library to get a quick overview of the data

In [None]:
# Build a pandas-profiling report over the whole training frame.
profile = ProfileReport(train_df)
# Bare last expression of the cell: the notebook displays the report as the cell output.
profile

# Baselines

### Baseline 1 - Alternating between the two most common values (0.5 and 0.25), ignoring the phrases

In [None]:
def baseline_result(df):
  """Score a content-free baseline that alternates between 0.50 and 0.25.

  Rows at even positions (0, 2, 4, ...) are predicted as 0.50 and rows at odd
  positions as 0.25; the phrases themselves are never looked at.

  Args:
    df: DataFrame with a numeric ``score`` column. It is modified in place:
      the helper columns ``number`` (row position) and ``pred`` (the guess)
      are added or overwritten.

  Returns:
    Pearson correlation between ``pred`` and ``score``.
  """
  # Assign the positions as a plain list, not a pd.Series: a Series is aligned
  # on the index during assignment, which leaves NaN wherever df does not have
  # a default 0..n-1 index (e.g. after filtering, sampling or concatenation).
  df['number'] = list(range(len(df)))
  # Vectorised parity lookup instead of a Python-level row-wise apply.
  df['pred'] = df['number'].mod(2).map({0: 0.50, 1: 0.25}).astype(float)
  correlation = df['pred'].corr(df['score'], method='pearson')
  return correlation
  
# Evaluate the alternating-guess baseline on the training set.
# Side effect: baseline_result also writes `number` and `pred` columns into train_df.
train_corr = baseline_result(train_df)
# Left disabled: baseline_result reads df['score'], which test_df presumably does not have - TODO confirm.
# test_corr = baseline_result(test_df)

print(f'Train pearson correlation {train_corr:.4f}')

### Baseline 2 - Fuzzywuzzy Similarity - Lexical (string matching)

In [None]:
from fuzzywuzzy import fuzz

def round_values(x):
    """Snap a similarity value onto the label grid {0, 0.25, 0.5, 0.75, 1}.

    Each grid value owns the half-open interval centred on it, so the bucket
    edges are 0.125, 0.375, 0.625 and 0.875 (an edge belongs to the bucket
    above it).

    Args:
        x: A real number, normally in [0, 1].

    Returns:
        The grid value of the bucket ``x`` falls in, as a float. Values below
        0 are clamped to 0.0 and values above 1 to 1.0 instead of falling
        through every branch and returning ``None`` (this happens for negative
        cosine similarities and for tiny floating-point overshoots above 1).
        NaN is returned unchanged so missing values stay visible.
    """
    # NaN is the only value that differs from itself; no bucket applies to it.
    if x != x:
        return float('nan')
    if x < 0.125:
        return 0.0
    if x < 0.375:
        return 0.25
    if x < 0.625:
        return 0.5
    if x < 0.875:
        return 0.75
    return 1.0

def _partial_ratio_unit(row):
    """Fuzzywuzzy partial ratio of a row's anchor and target, divided by 100."""
    return fuzz.partial_ratio(row['anchor'], row['target']) / 100


# Raw string similarity per pair, then the same values bucketed by round_values.
train_df['fuzzy'] = train_df.apply(_partial_ratio_unit, axis=1)
train_df['fuzzy_rounded'] = train_df['fuzzy'].map(round_values)

# Pearson correlation of the raw similarity with the labels.
train_corr = train_df['fuzzy'].corr(train_df['score'], method='pearson')
print(f'Train pearson correlation {train_corr:.4f}')

# Same metric for the bucketed similarity.
train_corr = train_df['fuzzy_rounded'].corr(train_df['score'], method='pearson')
print(f'Train pearson correlation for rounded similarity {train_corr:.4f}')

### Baseline 3 - Semantic Similarity using spaCy embeddings and SciPy distances

In [None]:
import spacy
from scipy.spatial import distance

# Load the spaCy pipeline whose document vectors are used as phrase embeddings.
nlp = spacy.load("en_core_web_lg")


def _add_embedding_columns(df):
    """Add phrase embeddings and their cosine similarity to ``df`` in place.

    Writes three columns:
      * ``anchor_embedding`` / ``target_embedding``: ``nlp(text).vector`` for
        the ``anchor`` and ``target`` phrase of each row.
      * ``similarity``: cosine similarity between the two embeddings.

    Args:
        df: DataFrame with string columns ``anchor`` and ``target``.
    """
    for side in ('anchor', 'target'):
        df[f'{side}_embedding'] = df[side].apply(lambda text: nlp(text).vector)
    # scipy's distance.cosine is a *distance* (0 when the vectors point the
    # same way), so it is subtracted from 1 to store what the column name
    # promises; previously this column held the raw distance.
    df['similarity'] = [
        1 - distance.cosine(anchor_vec, target_vec)
        for anchor_vec, target_vec in zip(df['anchor_embedding'], df['target_embedding'])
    ]


# Same feature computation for both splits.
_add_embedding_columns(train_df)
_add_embedding_columns(test_df)

In [None]:
def _pairwise_embedding_metric(df, metric):
    """Apply ``metric(anchor_embedding, target_embedding)`` to every row of ``df``.

    Returns a float Series that shares ``df``'s index.
    """
    values = [
        metric(anchor_vec, target_vec)
        for anchor_vec, target_vec in zip(df['anchor_embedding'], df['target_embedding'])
    ]
    return pd.Series(values, index=df.index, dtype=float)


def _train_pearson(column):
    """Pearson correlation between ``train_df[column]`` and the training scores."""
    return train_df[column].corr(train_df['score'], method='pearson')


# NOTE: despite its name, `cosine_distance` holds cosine *similarity*
# (1 - distance). The name is kept because the submission cell reads it.
train_df['cosine_distance'] = 1 - _pairwise_embedding_metric(train_df, distance.cosine)
train_df['cosine_distance_rounded'] = train_df['cosine_distance'].apply(round_values)
train_df['euclidean_distance'] = _pairwise_embedding_metric(train_df, distance.euclidean)
# round_values buckets onto the 0-1 label grid, but a Euclidean distance is
# unbounded, so feeding it in directly left almost every row without a bucket.
# Map the distance into (0, 1] first (1 for identical vectors, towards 0 as
# they move apart) so the rounded column is a usable score-like value.
train_df['euclidean_distance_rounded'] = (
    1 / (1 + train_df['euclidean_distance'])
).apply(round_values)

train_corr = _train_pearson('cosine_distance')
print(f'Train pearson correlation for cosine distance {train_corr:.4f}')

train_corr = _train_pearson('cosine_distance_rounded')
print(f'Train pearson correlation for cosine distance for rounded values {train_corr:.4f}')

# Raw distance: larger means less similar, so a negative correlation is expected.
train_corr = _train_pearson('euclidean_distance')
print(f'Train pearson correlation for euclidean distance {train_corr:.4f}')

train_corr = _train_pearson('euclidean_distance_rounded')
print(f'Train pearson correlation for euclidean distance for rounded values {train_corr:.4f}')


# Prediction column for the test set (cosine similarity, same naming caveat as above).
test_df['cosine_distance'] = 1 - _pairwise_embedding_metric(test_df, distance.cosine)

In [None]:
# Assemble the submission in one step: ids saved at load time plus the
# cosine-similarity prediction for each test pair.
submission_df = pd.DataFrame({
    'id': test_id,
    # distance.cosine can yield NaN (e.g. when a phrase has an all-zero
    # embedding). A missing score must not reach the submission file, so such
    # rows fall back to 0.0, i.e. "no evidence of similarity".
    'score': test_df['cosine_distance'].fillna(0.0),
})

In [None]:
# Write the predictions to submission.csv with a header row and without the DataFrame index.
submission_df.to_csv('submission.csv', index=False, header=True)