In [None]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
import string
import re
from wordcloud import WordCloud , STOPWORDS
import matplotlib.pyplot as plt
from termcolor import colored
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Load Dataset

In [None]:
# Read the train and test CSV files into DataFrames.
# Paths are relative to the notebook's working directory.
train_data=pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
test_data=pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')


# Exploratory Data Analysis

In [None]:
# Preview the first 10 rows of the training data.
train_data.head(10)

In [None]:
# Summary statistics of the training data (describe() defaults).
train_data.describe()

In [None]:
# Data type of each column.
train_data.dtypes

In [None]:
# Column names, non-null counts, dtypes and memory usage.
train_data.info()

In [None]:
# NOTE(review): train_data is produced by pd.read_csv, which already returns a
# DataFrame, so this re-wrap looks redundant -- confirm before removing.
train_data = pd.DataFrame(train_data)
# Preview the first rows (head() default count).
train_data.head()

In [None]:
# Number of distinct values in the anchor column.
train_data['anchor'].nunique()

In [None]:
# Count of missing values per column.
train_data.isnull().sum()

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# Distinct values of the context column, in sorted order.
np.sort(train_data['context'].unique())

In [None]:
# Distinct values taken by the score column.
train_data['score'].unique()

In [None]:
# Character-length range of the anchor phrases: longest and shortest.
anchor_chars = train_data['anchor'].str.len()
longest_anchor = anchor_chars.max()
shortest_anchor = anchor_chars.min()
print(f"The maximum length of anchor feature is {longest_anchor} and the minimum length of anchor feature is {shortest_anchor}")


In [None]:
# Normalise both phrase columns to lower case (the columns are overwritten).
for phrase_col in ('anchor', 'target'):
    train_data[phrase_col] = train_data[phrase_col].str.lower()

In [None]:
# Character-length range of the target phrases: longest and shortest.
target_chars = train_data['target'].str.len()
longest_target = target_chars.max()
shortest_target = target_chars.min()
print(f"The maximum length of target feature is {longest_target} and the minimum length of target feature is {shortest_target}")

In [None]:
# Word cloud built from all anchor phrases joined into one text.
anchor_text = train_data['anchor']
# Keep the word-cloud stopword set under its own name: rebinding `stopwords`
# would replace the nltk.corpus.stopwords reader imported at the top of the
# notebook, which preprocess() needs for stopwords.words('english').
wordcloud_stopwords = set(STOPWORDS)
wordcloud = WordCloud(width=800,
                      height=800,
                      background_color='white',
                      min_font_size=10,
                      stopwords=wordcloud_stopwords).generate(' '.join(anchor_text))

# Render the cloud as an image without axes or padding.
plt.figure(figsize=(8,8) , facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud built from all target phrases joined into one text.
target_text = train_data['target']
# Keep the word-cloud stopword set under its own name: rebinding `stopwords`
# would replace the nltk.corpus.stopwords reader imported at the top of the
# notebook, which preprocess() needs for stopwords.words('english').
wordcloud_stopwords = set(STOPWORDS)
wordcloud = WordCloud(width=800,
                      height=800,
                      background_color='white',
                      min_font_size=10,
                      stopwords=wordcloud_stopwords).generate(' '.join(target_text))

# Render the cloud as an image without axes or padding.
plt.figure(figsize=(8,8) , facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Number of whitespace-separated words in each anchor phrase.
anchor_len = train_data['anchor'].str.split().str.len()

# Horizontal histogram: word counts run along the y-axis, row counts along
# the x-axis. Labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(anchor_len, orientation='horizontal', color='blue')
ax.set(title='Words per anchor phrase',
       xlabel='Number of rows',
       ylabel='Words in anchor')
plt.show()


In [None]:
# Number of whitespace-separated words in each target phrase.
target_len = train_data['target'].str.split().str.len()

# Vertical histogram: word counts run along the x-axis, row counts along
# the y-axis. Labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(target_len, orientation='vertical', color='blue')
ax.set(title='Words per target phrase',
       xlabel='Words in target',
       ylabel='Number of rows')
plt.show()

# K-means Clustering

In [None]:
# Text cleaning: strip punctuation/digits, lowercase, drop stopwords, lemmatize.
def preprocess(text):
    """Clean ``text`` and return its lemmatized, stopword-free tokens.

    Processing order:
      1. remove every punctuation character (``string.punctuation``) and
         every digit character;
      2. split the remainder on whitespace and lowercase each token;
      3. drop tokens found in NLTK's English stopword list;
      4. lemmatize the surviving tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Imported locally under an alias so the lookup cannot be broken by a
    # notebook-level variable that happens to be called `stopwords`
    # (for example a plain set of word-cloud stopwords).
    from nltk.corpus import stopwords as nltk_stopwords

    lemmatizer = WordNetLemmatizer()
    # Load the stopword list once; a set gives constant-time membership tests.
    english_stopwords = set(nltk_stopwords.words('english'))

    cleaned = ''.join(
        ch for ch in text
        if ch not in string.punctuation and not ch.isdigit()
    )
    # Lowercase BEFORE the stopword test so capitalised stopwords are removed too.
    tokens = (word.lower() for word in cleaned.split())
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in english_stopwords]

In [None]:
# Concatenate all target phrases into one text. The separator must be a space:
# joining with '' would glue the last word of each phrase to the first word of
# the next one and create bogus tokens.
target = ' '.join(train_data['target'])
# Cleaned, lemmatized tokens of the whole training target text.
target_1 = preprocess(target)

In [None]:
# Learn the TF-IDF vocabulary from the preprocessed training tokens and build
# the matrix X. Each element of target_1 is passed as one document.
tfidconverter = TfidfVectorizer()
X = tfidconverter.fit_transform(target_1)

In [None]:
#preparing the model
# Number of clusters; passed to KMeans below so the two can never disagree.
cluster = 9
# A fixed random_state makes the centroid initialisation (and therefore the
# resulting clusters) reproducible across runs.
model = KMeans(n_clusters=cluster, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X)

In [None]:
print('Top terms of cluster:')
# argsort() sorts ascending, so reverse along the term axis (columns): each row
# then lists term indices from highest to lowest centroid weight. Reversing
# the rows instead would show the weakest terms of the wrong cluster.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older versions that lack it.
if hasattr(tfidconverter, 'get_feature_names_out'):
    terms = tfidconverter.get_feature_names_out()
else:
    terms = tfidconverter.get_feature_names()
# One row of order_centroids per fitted cluster.
for i in range(order_centroids.shape[0]):
    print("cluster %d" %i)
    # The ten highest-weighted terms of this cluster's centroid.
    for centroid in order_centroids[i,:10]:
        print(" %s "%terms[centroid])
    # Blank line between clusters.
    print()

In [None]:
#preparing for the test set
# Join with a space so the words of neighbouring phrases are not fused together.
test = ' '.join(test_data['target'])
test_target = preprocess(test)
# Vectorize with the vocabulary already fitted on the training tokens.
Y= tfidconverter.transform(test_target)

In [None]:
# Assign each row of Y (one row per preprocessed test token) to a cluster of
# the fitted KMeans model, then print the resulting labels.
print("Prediction\n")
prediction = model.predict(Y)
print(prediction)