In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
import csv
import os
from spacy import displacy
try:
    import stylecloud
except:
    !pip install stylecloud
    import stylecloud
warnings.filterwarnings("ignore")
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, log_loss
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold 
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from mlxtend.classifier import StackingClassifier

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, roc_curve

import xgboost as xgb
import lightgbm as lgb
try:
    import fasttreeshap
except:
    !pip install fasttreeshap
    import fasttreeshap

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files.
DS_DIR='../input/us-patent-phrase-to-phrase-matching'

test = pd.read_csv(DS_DIR +'/test.csv')
# `info` is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the column / dtype / non-null summary.
test.info()

In [None]:
train = pd.read_csv(DS_DIR +'/train.csv')
# `info` is a method: call it so the column / dtype / non-null summary is
# printed rather than the bound-method repr.
train.info()

In [None]:
# Discard the identifier and the raw text columns from the training frame.
train = train.drop(columns=['id', 'anchor', 'target', 'context'])

In [None]:
# Coerce every remaining column to a numeric dtype, echoing each column name
# as it is processed. pd.to_numeric is applied to the whole column at once
# rather than once per element through Series.apply.
cols = list(train.columns)
for col_name in cols:
    train[col_name] = pd.to_numeric(train[col_name])
    print(col_name)

In [None]:
# Pull out the label column before it is removed from the feature frame.
y_true = train.loc[:, 'score']

In [None]:
# Convert the labels to plain Python ints. int() truncates toward zero, so
# any value in [0, 1) becomes 0 and only values >= 1 keep a non-zero label.
y_true = [int(score) for score in y_true.values]

In [None]:
# The label now lives in y_true; remove it from the feature frame.
train = train.drop(columns='score')

In [None]:
# Stratified 70/30 hold-out split. The fixed seed makes the split (and every
# number derived from it further down) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train, y_true, stratify=y_true, test_size=0.3, random_state=0)

In [None]:
# Report the (rows, columns) shape of each side of the split.
train_shape = X_train.shape
test_shape = X_test.shape
print("Number of data points in train data :", train_shape)
print("Number of data points in test data :", test_shape)

In [None]:
# Share of each class in both sides of the stratified split.
print("-"*10, "Distribution of output variable in train data", "-"*10)
train_distr = Counter(y_train)
train_len = len(y_train)
print("Class 0: ", train_distr[0]/train_len, "Class 1: ", train_distr[1]/train_len)
# The second banner describes the test split, and its class-0 share must be
# read from test_distr[0] (the original printed test_distr[1] twice).
print("-"*10, "Distribution of output variable in test data", "-"*10)
test_distr = Counter(y_test)
test_len = len(y_test)
print("Class 0: ", test_distr[0]/test_len, "Class 1: ", test_distr[1]/test_len)

In [None]:
# This function plots the confusion, precision and recall matrices given
# the true labels and the predicted labels.
def plot_confusion_matrix(test_y, predict_y):
    """Draw the confusion, precision and recall matrices as three heatmaps.

    Parameters
    ----------
    test_y : iterable of true class labels.
    predict_y : iterable of predicted class labels, aligned with ``test_y``.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    # Tick labels are the classes that actually occur, in sorted order, and
    # the same list is handed to confusion_matrix so rows/columns and ticks
    # are guaranteed to line up (the previous hard-coded [1, 2] did not match
    # 0/1 labels).
    labels = sorted(set(test_y) | set(predict_y))
    C = confusion_matrix(test_y, predict_y, labels=labels)

    # Row-normalised: each row (true class) sums to 1 -> recall matrix.
    # A row whose sum is 0 yields NaN entries.
    A = (((C.T)/(C.sum(axis=1))).T)

    # Column-normalised: each column (predicted class) sums to 1 -> precision
    # matrix. A column whose sum is 0 yields NaN entries.
    B = (C/C.sum(axis=0))
    plt.figure(figsize=(20,4))

    cmap = sns.light_palette("green")

    # Raw counts.
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")

    # Column-normalised matrix B.
    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")

    # Row-normalised matrix A.
    plt.subplot(1, 3, 3)
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")

    plt.show()

In [None]:
# Baseline "random model": every test row gets two uniform random numbers
# that are normalised so the pair sums to 1 and can be read as class
# probabilities.
n_test = len(y_test)
rand_probs = np.random.rand(n_test, 2)
predicted_y = rand_probs / rand_probs.sum(axis=1, keepdims=True)
# The `eps` argument of log_loss is deprecated/removed in recent
# scikit-learn releases, so the library default clipping is used.
print("Log loss on Test Data using Random Model", log_loss(y_test, predicted_y))

# Hard predictions: index of the larger of the two probabilities.
predicted_y = np.argmax(predicted_y, axis=1)
plot_confusion_matrix(y_test, predicted_y)

In [None]:
# Reload the competition training file (all columns) and split it 50/50
# with a fixed seed for the exploratory analysis below.
DS_DIR='../input/us-patent-phrase-to-phrase-matching'

data = pd.read_csv(DS_DIR +'/train.csv', engine = "python")
train, test = train_test_split(data, test_size = 0.5, random_state = 0)
label_train = train["score"]
# The held-out labels must come from the held-out half (the original read
# them from `train`, duplicating label_train).
label_test = test["score"]
# Remove the label by name instead of by position so the feature frames do
# not depend on `score` being the last column.
train = train.drop(columns="score")
test = test.drop(columns="score")
print("Training data has {} rows and {} columns.".format(train.shape[0], train.shape[1])) 
print("Testing data has {} rows and {} columns.".format(test.shape[0], test.shape[1])) 

In [None]:
# One horizontal bar per distinct anchor phrase; bar length = number of rows.
# np.unique returns the sorted distinct values and their counts in a single
# pass, replacing a list.count() call per distinct value.
anchor_values, anchor_counts = np.unique(train["anchor"], return_counts=True)
fig = px.bar(y = anchor_values,
             x = anchor_counts,
             color = anchor_values,
             color_continuous_scale="Emrld",
             orientation='h',
             width=800,
             height=4000)
# Horizontal bars: the counts run along x and the phrases along y, so the
# axis titles follow that layout.
fig.update_xaxes(title="Number of Rows")
fig.update_yaxes(title="Anchor")
fig.update_layout(showlegend = True,
    title = {
        'text': 'Anchor Type Distribution ',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
        template="plotly_white")
fig.show()

In [None]:
# One horizontal bar per distinct target phrase; bar length = number of rows.
# np.unique returns the sorted distinct values and their counts in a single
# pass, replacing a list.count() call per distinct value.
target_values, target_counts = np.unique(train["target"], return_counts=True)
fig = px.bar(y = target_values,
             x = target_counts,
             color = target_values,
             color_continuous_scale="Emrld",
             orientation='h',
             width=800,
             height=4000)
# Horizontal bars: the counts run along x and the phrases along y, so the
# axis titles follow that layout.
fig.update_xaxes(title="Number of Rows")
fig.update_yaxes(title="Target")
fig.update_layout(showlegend = True,
    title = {
        'text': 'Target  Distribution ',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
        template="plotly_white")
fig.show()

In [None]:
# One horizontal bar per distinct context code; bar length = number of rows.
# np.unique returns the sorted distinct values and their counts in a single
# pass, replacing a list.count() call per distinct value.
context_values, context_counts = np.unique(train["context"], return_counts=True)
fig = px.bar(y = context_values,
             x = context_counts,
             color = context_values,
             color_continuous_scale="Emrld",
             orientation='h',
             width=800,
             height=4000)
# Horizontal bars: the counts run along x and the codes along y, so the
# axis titles follow that layout.
fig.update_xaxes(title="Number of Rows")
fig.update_yaxes(title="Context")
fig.update_layout(showlegend = True,
    title = {
        'text': 'Context  Distribution ',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
        template="plotly_white")
fig.show()

In [None]:
# Number of whitespace-separated tokens in each anchor / target phrase.
anchor_tokens = train['anchor'].str.split()
target_tokens = train['target'].str.split()
train['anchor_word_count'] = anchor_tokens.str.len()
train['target_word_count'] = target_tokens.str.len()
train.head()

In [None]:
# Histogram of anchor word counts with a violin plot in the margin.
fig = px.histogram(
    train,
    x="anchor_word_count",
    nbins=50,
    marginal="violin",
).update_layout(template="plotly_white")
fig.show()

In [None]:
# Histogram of target word counts with a violin plot in the margin.
fig = px.histogram(
    train,
    x="target_word_count",
    nbins=50,
    marginal="violin",
).update_layout(template="plotly_white")
fig.show()

In [None]:
import random

def random_color():
    """Return a random colour as an upper-case '#RRGGBB' hex string.

    Each of the three channels is drawn independently from 1..255 inclusive.
    """
    channels = tuple(random.randint(1, 255) for _ in range(3))
    return '#%02X%02X%02X' % channels

In [None]:
def generate_n_grams(text,ngram=1):
    """Return the word n-grams of ``text`` as space-joined strings.

    ``text`` is split on whitespace; every run of ``ngram`` consecutive
    words becomes one entry, in order of appearance. Texts with fewer than
    ``ngram`` words yield an empty list.
    """
    tokens = text.split()
    # The k-th shifted view starts at word k; zipping the views lines up
    # each word with its successors and stops at the shortest view.
    shifted_views = (tokens[offset:] for offset in range(ngram))
    return list(map(' '.join, zip(*shifted_views)))

# Top-50 n-gram frequency charts (UNIGRAM, BIGRAM, TRIGRAM) for the anchor
# phrases. One loop replaces three copy-pasted sections; the titles are kept
# verbatim.
for n_words, chart_title in (
        (1, "Top 50 words in UNIGRAM ANALYSIS - Anchor"),
        (2, "Top 50 words in BIGRAM ANALYSIS - Anchor "),
        (3, "Top 50 words in TRIGRAM ANALYSIS - Anchor")):
    counts = defaultdict(int)
    for text in train['anchor']:
        for gram in generate_n_grams(text, ngram=n_words):
            counts[gram] += 1

    # Most frequent first; keep the 50 largest.
    top = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:50]
    if not top:
        # No phrase is long enough to produce an n-gram of this size.
        continue
    top_ngrams = [gram for gram, _ in top]
    top_counts = [count for _, count in top]

    plt.figure(figsize=(30, 30))
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally, so barplot(counts, labels) raises a TypeError.
    sns.barplot(x=top_counts, y=top_ngrams, color=random_color())
    plt.xlabel("Count", fontsize=20)
    plt.ylabel("Words in dataframe", fontsize=20)
    plt.title(chart_title, fontsize=30)
    plt.tick_params(axis='both', labelsize=20)
    plt.show()

In [None]:
# Top-50 n-gram frequency charts (UNIGRAM, BIGRAM, TRIGRAM) for the target
# phrases. One loop replaces three copy-pasted sections; the titles are kept
# verbatim.
for n_words, chart_title in (
        (1, "Top 50 words in UNIGRAM ANALYSIS - Target"),
        (2, "Top 50 words in BIGRAM ANALYSIS - Target"),
        (3, "Top 50 words in TRIGRAM ANALYSIS - Target")):
    counts = defaultdict(int)
    for text in train['target']:
        for gram in generate_n_grams(text, ngram=n_words):
            counts[gram] += 1

    # Most frequent first; keep the 50 largest.
    top = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:50]
    if not top:
        # No phrase is long enough to produce an n-gram of this size.
        continue
    top_ngrams = [gram for gram, _ in top]
    top_counts = [count for _, count in top]

    plt.figure(figsize=(30, 30))
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally, so barplot(counts, labels) raises a TypeError.
    sns.barplot(x=top_counts, y=top_ngrams, color=random_color())
    plt.xlabel("Count", fontsize=20)
    plt.ylabel("Words in dataframe", fontsize=20)
    plt.title(chart_title, fontsize=30)
    plt.tick_params(axis='both', labelsize=20)
    plt.show()

In [None]:
# UNIGRAM
# Frequency of every whitespace-separated token in the context column.
counts = defaultdict(int)
for text in train['context']:
    for gram in generate_n_grams(text):
        counts[gram] += 1

# Most frequent first; keep the 50 largest.
top = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:50]
top_ngrams = [gram for gram, _ in top]
top_counts = [count for _, count in top]

plt.figure(figsize=(30, 30))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so barplot(counts, labels) raises a TypeError.
sns.barplot(x=top_counts, y=top_ngrams, color=random_color())
plt.xlabel("Count", fontsize=20)
plt.ylabel("Words in dataframe", fontsize=20)
plt.title("Top 50 words in UNIGRAM ANALYSIS - CONTEXT", fontsize=30)
plt.tick_params(axis='both', labelsize=20)
plt.show()


In [None]:
# Histogram of target word counts, coloured by anchor phrase.
hist_fig, hist_ax = plt.subplots(figsize=(20, 6))
sns.histplot(
    data=train,
    x='target_word_count',
    hue='anchor',
    bins=50,
    palette='rainbow',
    ax=hist_ax,
)
hist_ax.set_title('Distribution of target_word_count in Training Data', fontsize=15)

plt.show()

In [None]:
import wordcloud

# Word cloud built from all target phrases joined into one string.
# The rendered cloud gets its own name: assigning it to `wordcloud` replaced
# the imported module with the WordCloud instance for the rest of the session.
target_cloud = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, max_font_size=80, max_words=5000,
                      width = 600, height = 400,
                      background_color='gray').generate(' '.join(train["target"]))
fig, ax = plt.subplots(figsize=(14,10))
# Draw the image once; a second plt.imshow() on the same axes only re-drew it
# without the requested interpolation.
ax.imshow(target_cloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
import wordcloud

# Word cloud built from all target phrases joined into one string, on a blue
# background.
# NOTE(review): this cell uses the same column as the previous word cloud;
# confirm whether `anchor` was intended here.
# The rendered cloud gets its own name: assigning it to `wordcloud` replaced
# the imported module with the WordCloud instance for the rest of the session.
target_cloud_blue = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, max_font_size=80, max_words=5000,
                      width = 600, height = 400,
                      background_color='blue').generate(' '.join(train["target"]))
fig, ax = plt.subplots(figsize=(14,10))
# Draw the image once; a second plt.imshow() on the same axes only re-drew it
# without the requested interpolation.
ax.imshow(target_cloud_blue, interpolation='bilinear')
ax.set_axis_off()
plt.show()

In [None]:
#code credit - https://www.kaggle.com/code/hasanbasriakcay/patent-phrase-matching-eda-fe-baseline

from IPython.core.display import HTML
def value_counts_all(df, columns):
    """Show the value counts of several columns side by side.

    For every name in ``columns`` the value counts of ``df[name]`` are
    rendered as an HTML table; the tables are placed in adjacent cells of a
    single-row outer table, which is returned wrapped in ``HTML``.

    Side effect: sets the pandas option ``display.max_rows`` to 50.
    """
    pd.set_option('display.max_rows', 50)
    rendered_cells = []
    for column in columns:
        counts_frame = pd.DataFrame(df[column].value_counts())
        rendered_cells.append('<td>' + counts_frame._repr_html_() + '</td>')
    joined_cells = ''.join(rendered_cells)
    return HTML(f"<table><tr> {joined_cells} </tr></table>")

In [None]:
# Side-by-side frequency tables for the text columns of the training split.
train_text_columns = ['anchor', 'target', 'context']
value_counts_all(train, train_text_columns)

In [None]:
# Side-by-side frequency tables for the text columns of the test split.
test_text_columns = ['anchor', 'target', 'context']
value_counts_all(test, test_text_columns)

In [None]:
# Readable name for each one-letter section prefix of a context code.
context_dict = dict(
    A='Human Necessities',
    B='Operations and Transport',
    C='Chemistry and Metallurgy',
    D='Textiles',
    E='Fixed Constructions',
    F='Mechanical Engineering',
    G='Physics',
    H='Electricity',
    Y='Emerging Cross-Sectional Technologies',
)

In [None]:
# Lookup table of code titles, read from the companion cpc-codes dataset.
CPC_TITLES_PATH = "../input/cpc-codes/titles.csv"
cpc_codes_df = pd.read_csv(CPC_TITLES_PATH)
cpc_codes_df.head(n=10)

In [None]:
#code source - https://www.kaggle.com/code/hasanbasriakcay/patent-phrase-matching-eda-fe-baseline 

def create_feature(df, cpc_codes_df):
    """Add context, length, digit and fuzzy-match features to ``df``.

    Parameters
    ----------
    df : DataFrame with string columns ``anchor``, ``target`` and
        ``context``. It is modified in place.
    cpc_codes_df : DataFrame with ``code`` and ``title`` columns, used to
        look up a description for each context code.

    Returns
    -------
    The same ``df`` object, with ``context`` removed and these columns added:
    ``section``, ``class``, ``anchor_len``, ``target_len``, ``num_anchor``,
    ``num_target``, ``context_desc``, ``fuzzy_at_score``, ``fuzzy_c_score``
    and ``fuzzy_total``.
    """
    from fuzzywuzzy import fuzz

    # First character of the context code vs. the remainder.
    df['section'] = df['context'].str[:1]
    df['class'] = df['context'].str[1:]

    # Number of single-space-separated pieces in each phrase.
    df['anchor_len'] = df['anchor'].apply(lambda x: len(x.split(' ')))
    df['target_len'] = df['target'].apply(lambda x: len(x.split(' ')))

    # Flags: does the phrase contain at least one digit?
    digit_pattern = '[0-9]'
    df['num_anchor'] = df['anchor'].str.contains(digit_pattern, na=False)
    df['num_target'] = df['target'].str.contains(digit_pattern, na=False)

    df['context_desc'] = df['context'].map(cpc_codes_df.set_index('code')['title']).str.lower()

    # Codes missing from the lookup map to NaN; score those rows against an
    # empty string so the scorer always receives str arguments.
    context_text = df['context_desc'].fillna('')

    # Pairwise fuzzy ratios, computed over zipped columns instead of a
    # row-by-row iterrows() loop.
    df['fuzzy_at_score'] = [
        fuzz.ratio(anchor, target)
        for anchor, target in zip(df['anchor'], df['target'])
    ]
    df['fuzzy_ac_score'] = [
        fuzz.ratio(anchor, desc)
        for anchor, desc in zip(df['anchor'], context_text)
    ]
    df['fuzzy_tc_score'] = [
        fuzz.ratio(desc, target)
        for desc, target in zip(context_text, df['target'])
    ]
    df['fuzzy_c_score'] = df['fuzzy_ac_score'] + df['fuzzy_tc_score']
    df['fuzzy_total'] = df['fuzzy_at_score'] + df['fuzzy_c_score']

    # `columns=` instead of a positional axis argument, which pandas 2.0
    # no longer accepts.
    df.drop(columns=['context', 'fuzzy_ac_score', 'fuzzy_tc_score'], inplace=True)

    return df

In [None]:
# Build the engineered-feature frames from copies, leaving train/test intact.
new_train, new_test = (
    create_feature(frame.copy(), cpc_codes_df) for frame in (train, test)
)
new_train.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
fig, ax = plt.subplots(figsize=(16, 8))
# Fix the bar order explicitly and derive the tick labels from that same
# order. A hard-coded label list only matches the bars when the sections
# happen to appear in one particular order in the data, which a shuffled
# split does not guarantee.
section_order = sorted(new_train['section'].dropna().unique())
# Passing only `y` already produces horizontal bars.
sns.countplot(data=new_train, y='section', order=section_order, ax=ax)
ax.set_yticklabels([context_dict.get(code, code) for code in section_order], rotation=0);

In [None]:
# Row counts per `class` value (the part of the context code after its first
# character), drawn on a tall canvas so every bar gets its own line.
fig, ax = plt.subplots(figsize=(16, 32))
sns.countplot(y='class', data=new_train, orientation='horizontal', ax=ax)


In [None]:
# Joint distribution of target length and total fuzzy score.
# jointplot builds its own figure and grid, so no `ax` is passed: the earlier
# `ax=ax` only tied this cell to an axes object left over from another cell.
g = sns.jointplot(data=new_train, x="target_len", y="fuzzy_total")
# Overlay KDE contours on the joint axes and rug marks on the marginals.
g.plot_joint(sns.kdeplot, color="r", zorder=0, levels=6)
g.plot_marginals(sns.rugplot, color="r", height=-.15, clip_on=False)

In [None]:
from IPython.display import Image
from nltk.corpus import stopwords

# Reference - https://www.kaggle.com/kapakudaibergenov/stylecloud/notebook
# All anchor phrases joined into a single space-separated string.
concat_data = ' '.join(train.anchor.astype(str))
anchor_cloud_options = dict(
    icon_name='fas fa-book',
    palette='cartocolors.qualitative.Bold_6',
    background_color='black',
    gradient='horizontal',
    size=1024,
)
stylecloud.gen_stylecloud(text=concat_data, **anchor_cloud_options)


# Display the image file written by the call above.
Image(filename="./stylecloud.png", width=1024, height=1024)

In [None]:
# Reference - https://www.kaggle.com/kapakudaibergenov/stylecloud/notebook
# All target phrases joined into a single space-separated string.
concat_data = ' '.join(train.target.astype(str))
# `stylecloud2` was never defined; the generator lives in the `stylecloud`
# module. The output file is named explicitly so the image displayed below is
# the one generated here.
stylecloud.gen_stylecloud(text=concat_data,
                          icon_name='fas fa-film',
                          palette='cartocolors.qualitative.Prism_10',
                          background_color='blue',
                          gradient='horizontal',
                          size=2048,
                          output_name='./stylecloud2.png')


Image(filename="./stylecloud2.png", width=2048, height=2048)