In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Problem definition:

In this competition, we build a model that matches phrases in patent documents, taking into account the context in which each phrase appears. Determining the semantic similarity between phrases is critically important during the patent search and examination process to determine if an invention has been described before. 

For example, if one invention claims "television set" and a prior publication describes "TV set", a model would ideally recognize these are the same and assist a patent attorney or examiner in retrieving relevant documents. This extends beyond paraphrase identification; if one invention claims a "strong material" and another uses "steel", that may also be a match. What counts as a "strong material" varies per domain (it may be steel in one domain and ripstop fabric in another, but you wouldn't want your parachute made of steel). 

# EDA:

In this dataset, there are pairs of phrases (an anchor and a target phrase), and the task is to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning). This challenge differs from a standard semantic similarity task in that similarity is scored within a patent's context, specifically its CPC classification (version 2021.05), which indicates the subject to which the patent relates. 

Score meanings:
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

1. '1.0' - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
2. '0.75' - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
3. '0.5' - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
4. '0.25' - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
5. '0.0' - Unrelated.


Columns in the dataset csv file:

1. id - a unique identifier for a pair of phrases
2. anchor - the first phrase
3. target - the second phrase
4. context - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
5. score - the similarity. This is sourced from a combination of one or more manual expert ratings.


In [None]:
# Root directory of the competition data; train.csv is read from here in a later cell.
path = '/kaggle/input/us-patent-phrase-to-phrase-matching'

In [None]:
#import libraries
import pandas as pd
import numpy as np
pd.set_option("display.max_rows", None, "display.max_columns", None)
import matplotlib.pyplot as plt 
%matplotlib inline
from matplotlib.ticker import FuncFormatter

import seaborn as sns
 
import glob
import os


from wordcloud import WordCloud, STOPWORDS


import nltk
from nltk.corpus import stopwords


import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Load the training pairs from train.csv under the competition data directory.
train_df = pd.read_csv(f'{path}/train.csv')

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Number of distinct values in each column.
train_df.nunique()

In [None]:
# Data type of each column.
train_df.dtypes

In [None]:
# Count the columns that contain at least one missing value; 0 means the data has no missing values.
train_df.isna().any().sum()


In [None]:
# Number of rows per score value, drawn as a bar chart.
score_dist = train_df.groupby("score")['id'].size()
score_ax = score_dist.plot.bar()
score_ax.set_title('Score Distribution')
plt.show()

In [None]:
# Number of rows per CPC context code, sorted from least to most frequent.
cpc_dist = train_df.groupby("context")['id'].size().sort_values()

plt.figure(figsize=(20, 7))
cpc_ax = cpc_dist.plot.bar()
cpc_ax.set_title('CPC Classes Distribution')
plt.show()

In the CPC classification, the first letter of a code denotes the section:

A: Human Necessities

B: Operations and Transport

C: Chemistry and Metallurgy

D: Textiles

E: Fixed Constructions

F: Mechanical Engineering

G: Physics

H: Electricity

Y: Emerging Cross-Sectional Technologies

This classification closely follows the International Patent Classification.

In [None]:
# Split each 'context' code into two new columns:
# 'section' = its first character, 'class' = everything after the first character.
context_codes = train_df['context'].astype(str)
train_df['section'] = context_codes.str.get(0)
train_df['class'] = context_codes.str.slice(start=1)

In [None]:
# Row counts per CPC 'section' (top panel) and per CPC 'class' (bottom panel).
fig, ax = plt.subplots(2, 1, figsize=(12, 8))
section_ax, class_ax = ax

sns.countplot(data=train_df, x='section', ax=section_ax)
sns.countplot(data=train_df, x='class', ax=class_ax)

plt.show()

# Analysis of Anchor and Target phrases:


In [None]:
# The ten anchor phrases that occur in the most rows ("Top ten Anchor Words").
anchor_word_dist = train_df.groupby("anchor")['id'].size().sort_values().tail(10)

plt.figure(figsize=(10, 5))
anchor_ax = anchor_word_dist.plot.bar()
anchor_ax.set_ylabel('frequency of word')
anchor_ax.set_title('Top Ten Anchor Words')
plt.show()

In [None]:
# Count how many rows each target phrase appears in and preview the first few counts.
# The author observed that targets mostly occur only once or twice,
# so the next step looks at the targets associated with a single anchor.

target_word_dist = train_df.groupby("target")['id'].size()
target_word_dist.head()

In [None]:
# Look at the targets associated with anchor == 'component composite coating'
# (the original analysis notes that it anchors 127 targets; not re-verified here).
train_df.loc[train_df['anchor'] == 'component composite coating'].head()

In [None]:

# Add 'anchor_len' and 'target_len': the character length of each anchor / target phrase.
for phrase_col in ('anchor', 'target'):
    train_df[phrase_col + '_len'] = train_df[phrase_col].astype(str).map(len)

In [None]:
# Longest, then shortest, target phrase length in characters
# (reported by the author as 98 and 2).
display(train_df['target_len'].max())
display(train_df['target_len'].min())

In [None]:
# Distribution of text length (in characters) in the 'target' phrases.
# sns.distplot is deprecated since seaborn 0.11; histplot with kde=True and
# stat="density" is the replacement suggested by seaborn's migration notes.
target_length = train_df['target_len']
fig, ax = plt.subplots(figsize=(10, 5))

sns.histplot(target_length, bins=50, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Distribution of 'target' phrase length")
ax.set_xlabel('Target phrase length (characters)')
plt.show()

In [None]:
# Show the rows whose target phrase is only 2 characters long.
train_df.loc[train_df['target_len'] == 2]

In [None]:
# Show the rows whose target phrase is 98 characters long.
train_df.loc[train_df['target_len'] == 98]

In [None]:

# Longest, then shortest, anchor phrase length in characters
# (reported by the author as 38 and 3).
display(train_df['anchor_len'].max())
display(train_df['anchor_len'].min())

In [None]:
# Distribution of text length (in characters) in the 'anchor' phrases.
# sns.distplot is deprecated since seaborn 0.11; histplot with kde=True and
# stat="density" is the replacement suggested by seaborn's migration notes.
anchor_len = train_df['anchor_len']
fig, ax = plt.subplots(figsize=(10, 5))

sns.histplot(anchor_len, bins=30, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Distribution of 'anchor' phrase length")
ax.set_xlabel('Anchor phrase length (characters)')
plt.show()

In [None]:
# First ten rows whose anchor phrase is 3 characters long.
train_df.loc[train_df['anchor_len'] == 3].head(10)

In [None]:
# First three rows whose anchor phrase is 38 characters long.
train_df.loc[train_df['anchor_len'] == 38].head(3)

In [None]:

# Add 'anchor_word_count' and 'target_word_count': the number of whitespace-separated
# words in each anchor / target phrase.
train_df["anchor_word_count"] = [len(phrase.split()) for phrase in train_df["anchor"]]
train_df["target_word_count"] = [len(phrase.split()) for phrase in train_df["target"]]

In [None]:
# How many anchor phrases have each word count (bars ordered by frequency).
anchor_word_cnt = train_df["anchor_word_count"].value_counts()

plt.figure(figsize=(8, 3))
anchor_cnt_ax = anchor_word_cnt.plot.bar()
anchor_cnt_ax.set_ylabel('distribution')
anchor_cnt_ax.set_xlabel('Count of words')
anchor_cnt_ax.set_title('Word count distribution in Anchor phrase')
plt.show()

In [None]:
# How many target phrases have each word count (bars ordered by frequency).
target_word_cnt = train_df["target_word_count"].value_counts()

plt.figure(figsize=(8, 3))
target_cnt_ax = target_word_cnt.plot.bar()
target_cnt_ax.set_ylabel('distribution')
target_cnt_ax.set_xlabel('Count of words')
target_cnt_ax.set_title('Word count distribution in Target phrase')
plt.show()

In [None]:
# Preview the frame again, now including the derived columns added above
# (section, class, anchor_len, target_len, anchor_word_count, target_word_count).
train_df.head()

In [None]:

# The ten most frequent (CPC section, anchor phrase) combinations.
anchor_section = train_df.groupby(['section','anchor']).size()
anchor_section = anchor_section.sort_values().tail(10)
anchor_section = anchor_section.reset_index()
anchor_section.columns = ['section', 'anchor', 'count']

# Horizontal bars: the count runs along x, the CPC section along y, one colour per anchor.
# The axis labels were previously swapped relative to the plotted variables.
plt.figure(figsize=(12, 5))
sns.barplot(x='count', y='section', hue='anchor', data=anchor_section, orient = 'h')
plt.xlabel('Count')
plt.ylabel('CPC Section')
plt.title('Top ten anchor phrases by CPC section')
plt.show()

# Wordcloud

Word Cloud is a data visualization technique used for representing text data in which the size of each word indicates its frequency or importance. Significant textual data points can be highlighted using a word cloud. Word clouds are widely used for analyzing data from social network websites.

We will write a simple and intuitive function plot_wordcloud that will help us plot wordclouds with ease.

In [None]:
# function to plot word cloud:
def plot_wordcloud(column, title, data=None):
    
    """
    Function to Plot Wordcloud of given dataframe column.
    
    params: column(string): The Column of the DataFrame for plotting.
            title(string) : The Title of the Wordcloud (appended to 'Wordcloud: ').
            data(DataFrame, optional): The DataFrame to read `column` from.
                                       Defaults to the notebook-level `train_df`,
                                       so existing two-argument calls are unchanged.
    """
    # Fall back to the full training frame when no frame is given
    if data is None:
        data = train_df

    # Drop missing cells and cast to str so that ' '.join cannot fail on non-string values
    text = ' '.join(data[column].dropna().astype(str))

    # Define stopwords
    stopwords = set(STOPWORDS) 
    
    # Define the Wordcloud    
    wordcloud = WordCloud(width = 800, 
                          height = 800,
                          background_color ='black',
                          min_font_size = 10,
                          stopwords = stopwords).generate(text) 

    # Plot the WordCloud image                        
    plt.figure(figsize = (8, 8), facecolor = None) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
    plt.title('Wordcloud: ' + title, fontsize = 20)

    plt.show() 

In [None]:
# Word cloud of the most frequent words across all 'anchor' phrases.
plot_wordcloud('anchor', 'Most frequent words in anchor')

In [None]:
#Most frequent words in 'target' (the title previously said "anchor" by mistake):
plot_wordcloud(column = 'target', title = 'Most frequent words in target')

Summary:
1. There are 36473 unique ids, in which 733 distinct 'anchor' (first) phrases have been matched with 29340 distinct 'target' (second) phrases. Therefore we have about 40 distinct targets per anchor on average (29340 / 733 ≈ 40)!

2. There are 106 CPC classifications in which the similarities have been scored. H01 (BASIC ELECTRIC ELEMENTS), H04 (ELECTRIC COMMUNICATION TECHNIQUE) and G01 (MEASURING; TESTING, in the Physics section) are the most frequent categories of patents that have been scored.

3. Section 'B' ('Operations and Transport' in the CPC classification) has the highest number of observations, whereas Section 'D' (Textiles) has the lowest number of observations. 


4. further EDA shows that:

   a. the max length of 'anchor' phrase is 38 while minimum is 3.
   
   b. the max length of 'target' phrase is 98 while minimum is 2.
   
   c. The number of words in 'anchor' phrases is 2-5, while 'target' phrases have 2-6 words. 
   
   d. Most of the 'anchor' and 'target' phrases are 2 words in length.


# Score analysis:
As we can see, less than 5% of phrases have a score of 1. 0.25 and 0.5 are most common scores.

So a detailed analysis on how features like CPC 'section' affects the score, shows that:
1. CPC section "B" has most Score matching. While CPC section 'D' and 'E' has the least phrase matching scores.
2. Most of the sections have a mean score of around 0.35
3. There are more observations with a score of 0 than with a score of 1. Perfect scores are very rare.
4. Section A and C have more 0's than 1 in their observations
5. The percentage of 0's is highest for H section after section B

How do the target and anchor phrases affect the phrase-matching score?
1. Top anchor phrases show a score of 0.25 and 0.5.
2. Average of Target length is same across all the scores classes. Score == 0.25 and 0.5 involves longer 'target' phrases. 
3. Similarly average Anchor length is same across all the Score classes. Score == 0.25 and 0.5 involves longer 'Anchor' phrases. 



In [None]:
# Percentage of rows that fall into each score value, drawn as a bar chart.
score_dist = train_df.groupby("score")['id'].size()
score_dist = score_dist * (100 / score_dist.sum())

score_pct_ax = score_dist.plot.bar()
score_pct_ax.set_ylabel('%count')
score_pct_ax.set_title('Score Distribution')
plt.show()

# Score and CPC sections:

In [None]:
# Relationship between the phrase-matching 'score' and the CPC 'section':
# count the rows for every (section, score) pair.
score_context = train_df.groupby(['section', 'score']).size().reset_index(name='count')

# Grouped bar chart: one cluster per section, one bar per score value.
plt.figure(figsize=(18, 8))
sns.barplot(data=score_context, x='section', y='count', hue='score', palette="deep")
plt.xlabel('CPC Categories')
plt.ylabel('Count')

In [None]:
#A stacked bar chart to visualize the score distribution in CPC sections:
#credit: https://www.kaggle.com/code/valentinwerner/in-depth-eda-patentchallenge
# Build the section x score count table in one pass. The earlier chain of merges on
# unnamed 'id' columns produced duplicate 'id_x'/'id_y' column names (a MergeError on
# recent pandas) and its inner joins dropped any section lacking one of the score levels.
score_levels = [0.0, 0.25, 0.5, 0.75, 1.0]
scores_plot = (
    train_df.groupby(["section", "score"]).id.count()
    .unstack("score", fill_value = 0)
    .reindex(columns = score_levels, fill_value = 0)  # guarantee all five score columns, in order
)
scores_plot.columns = ["count: score 0.0", "count: score 0.25", "count: score 0.50", "count: score 0.75", "count: score 1.0"]

# Order the sections by their total number of observations, largest first
section_order = scores_plot.sum(axis = 1).sort_values(ascending = False).index
scores_plot = scores_plot.loc[section_order]

#Creating the stacked barchart for scores
fig, ax =plt.subplots(figsize = (16,8))
scores_plot.plot(kind = "bar", stacked = True, ax = ax)
plt.legend(fontsize = 18)
#This plot underlines how rare perfect scores are and how very common 0.25 and 0.5 are as score.

# Scores and Length of 'anchor' and 'target' phrases:

In [None]:
# Box plot of target phrase length (characters) for each score value,
# to see whether target_len varies with the score.
fig = plt.figure(figsize=(10, 5))
sns.boxplot(data=train_df, x="score", y="target_len", palette="deep")

In [None]:
# line plot showing the relationship between target length and score:
# count the rows for every (target_len, score) pair
r= train_df.groupby(['target_len','score']).size()
r = r.reset_index()
r.columns = ['target_len', 'score', 'count']
#plot a line graph, one line per score value:
plt.figure(figsize=(15, 5))
sns.lineplot(x='target_len', y='count', hue='score', data=r , palette= 'tab10',)
# x carries the phrase length and y the count (the labels were previously swapped)
plt.xlabel('Target Phrase length')
plt.ylabel('Count')

In [None]:
# Mean target phrase length (characters) for each score value, shown as a bar chart.
target_len_mean = (
    train_df.groupby(['score'])['target_len']
    .mean()
    .reset_index(name='Average target length')
)

plt.figure(figsize=(12, 4))
sns.barplot(x='score', y='Average target length', data=target_len_mean, palette="deep")
plt.show()

In [None]:
# Box plot of anchor phrase length (characters) for each score value.
fig = plt.figure(figsize=(10, 5))
sns.boxplot(data=train_df, x="score", y="anchor_len", palette="deep")

In [None]:
# line plot showing the relationship between anchor length and score:
# count the rows for every (anchor_len, score) pair
r= train_df.groupby(['anchor_len','score']).size()
r = r.reset_index()
r.columns = ['anchor_len', 'score', 'count']
#plot a line graph, one line per score value:
plt.figure(figsize=(15, 5))
sns.lineplot(x='anchor_len', y='count', hue='score', data=r , palette= 'deep')
# x carries the phrase length and y the count (the labels were previously swapped)
plt.xlabel('Anchor Phrase length')
plt.ylabel('Count')

In [None]:
# Average anchor length and score:
# average 'anchor_len' per score (this previously averaged 'target_len' by mistake,
# so the chart repeated the target-length result under an anchor label)
anchor_len_mean = train_df.groupby(['score'])['anchor_len'].mean()
anchor_len_mean = anchor_len_mean.reset_index()
anchor_len_mean.columns = ['score', 'Average Anchor length']
#plot a bar graph :
plt.figure(figsize=(12, 4))
sns.barplot(data = anchor_len_mean, x = 'score', y= 'Average Anchor length', palette="deep")
plt.show()

# Scores and word count:

In [None]:
# Relationship between the phrase-matching 'score' and the number of words in the 'target' phrase:
# count the rows for every (target_word_count, score) pair.
score_target_word_count = (
    train_df.groupby(['target_word_count', 'score'])
    .size()
    .reset_index(name='count')
)

# Grouped bar chart: one cluster per word count, one bar per score value.
plt.figure(figsize=(15, 6))
sns.barplot(data=score_target_word_count, x='target_word_count', y='count', hue='score', palette="deep")
plt.xlabel('Target_word_count')
plt.ylabel('Count')

In [None]:
# Relationship between the phrase-matching 'score' and the number of words in the 'anchor' phrase:
# count the rows for every (anchor_word_count, score) pair.
score_an_word_count = (
    train_df.groupby(['anchor_word_count', 'score'])
    .size()
    .reset_index(name='count')
)

# Grouped bar chart: one cluster per word count, one bar per score value.
plt.figure(figsize=(8, 5))
sns.barplot(data=score_an_word_count, x='anchor_word_count', y='count', hue='score', palette="deep")
plt.xlabel('anchor_word_count')
plt.ylabel('Count')

In [None]:
# How the most frequent 'anchor' phrases relate to the 'score':
# keep the 15 most frequent (anchor, score) combinations.
anchor_score = (
    train_df.groupby(['anchor', 'score'])
    .size()
    .sort_values()
    .tail(15)
    .reset_index(name='count')
)

# Horizontal grouped bars: count on x, anchor phrase on y, one colour per score.
plt.figure(figsize=(15, 8))
# NOTE(review): sns.set changes seaborn's global theme, so the larger font scale
# presumably also applies to figures drawn after this cell - confirm this is intended.
sns.set(font_scale=1.5)
sns.barplot(data=anchor_score, x='count', y='anchor', hue='score', orient='h', palette="deep")
plt.xlabel('Count')
plt.ylabel('Anchor phrase')


# Word Cloud of 'anchor' and 'target' phrases in each score class:

Word Cloud for score == 1.0: Very close match. 

This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).

In [None]:
# Subset of the training data with score == 1.0, followed by a preview.
train_df_1 = train_df.loc[train_df['score'] == 1.0]
train_df_1.head()

In [None]:
# Word cloud helper for the score == 1.0 subset (reads the global train_df_1):
def plot_wordcloud_1(column, title):
    """
    Draw a word cloud from one text column of train_df_1.

    params: column(string): name of the train_df_1 column whose phrases feed the cloud.
            title(string) : text shown after the 'Wordcloud: ' prefix in the figure title.
    """
    # Join every phrase of the column into one space-separated text
    text = ' '.join(train_df_1[column])

    # 800x800 cloud on a black background, with wordcloud's STOPWORDS excluded
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
        stopwords=set(STOPWORDS),
    )
    cloud_image = cloud_builder.generate(text)

    # Show the cloud on a square figure without axes
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Wordcloud: ' + title, fontsize=20)

    plt.show()

In [None]:
# Word clouds for the score == 1.0 subset: anchor phrases first, then target phrases.
plot_wordcloud_1('anchor', 'Most frequent Anchor phrase in Score == 1.0')
plot_wordcloud_1('target', 'Most frequent Target phrase in Score == 1.0')

Word cloud of score == 0.75: Close synonym

e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".

In [None]:
# Subset of the training data with score == 0.75, followed by a preview.
train_df_2 = train_df.loc[train_df['score'] == 0.75]
train_df_2.head()

In [None]:
# Word cloud helper for the score == 0.75 subset (reads the global train_df_2):
def plot_wordcloud_2(column, title):
    """
    Draw a word cloud from one text column of train_df_2.

    params: column(string): name of the train_df_2 column whose phrases feed the cloud.
            title(string) : text shown after the 'Wordcloud: ' prefix in the figure title.
    """
    # Join every phrase of the column into one space-separated text
    text = ' '.join(train_df_2[column])

    # 800x800 cloud on a black background, with wordcloud's STOPWORDS excluded
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
        stopwords=set(STOPWORDS),
    )
    cloud_image = cloud_builder.generate(text)

    # Show the cloud on a square figure without axes
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Wordcloud: ' + title, fontsize=20)

    plt.show()

In [None]:
# Word clouds for the score == 0.75 subset: anchor phrases first, then target phrases.
plot_wordcloud_2('anchor', 'Most frequent Anchor phrase in Score == 0.75')
plot_wordcloud_2('target', 'Most frequent Target phrase in Score == 0.75')

Word cloud for score == 0.5: Synonyms which don’t have the same meaning (same function, same properties). 

This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.

In [None]:
# Subset of the training data with score == 0.5, followed by a preview.
train_df_3 = train_df.loc[train_df['score'] == 0.5]
train_df_3.head()

In [None]:
# Word cloud helper for the score == 0.5 subset (reads the global train_df_3):
def plot_wordcloud_3(column, title):
    """
    Draw a word cloud from one text column of train_df_3.

    params: column(string): name of the train_df_3 column whose phrases feed the cloud.
            title(string) : text shown after the 'Wordcloud: ' prefix in the figure title.
    """
    # Join every phrase of the column into one space-separated text
    text = ' '.join(train_df_3[column])

    # 800x800 cloud on a black background, with wordcloud's STOPWORDS excluded
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
        stopwords=set(STOPWORDS),
    )
    cloud_image = cloud_builder.generate(text)

    # Show the cloud on a square figure without axes
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Wordcloud: ' + title, fontsize=20)

    plt.show()

In [None]:
# Word clouds for the score == 0.5 subset: anchor phrases first, then target phrases.
plot_wordcloud_3('anchor', 'Most frequent Anchor phrase in Score == 0.5')
plot_wordcloud_3('target', 'Most frequent Target phrase in Score == 0.5')

Word Cloud for score == '0.25' - Somewhat related

e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.

In [None]:
# Subset of the training data with score == 0.25, followed by a preview.
train_df_4 = train_df.loc[train_df['score'] == 0.25]
train_df_4.head()

In [None]:
# Word cloud helper for the score == 0.25 subset (reads the global train_df_4):
def plot_wordcloud_4(column, title):
    """
    Draw a word cloud from one text column of train_df_4.

    params: column(string): name of the train_df_4 column whose phrases feed the cloud.
            title(string) : text shown after the 'Wordcloud: ' prefix in the figure title.
    """
    # Join every phrase of the column into one space-separated text
    text = ' '.join(train_df_4[column])

    # 800x800 cloud on a black background, with wordcloud's STOPWORDS excluded
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
        stopwords=set(STOPWORDS),
    )
    cloud_image = cloud_builder.generate(text)

    # Show the cloud on a square figure without axes
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Wordcloud: ' + title, fontsize=20)

    plt.show()

In [None]:
#Most frequent 'anchor' phrases with score== 0.25:
plot_wordcloud_4(column = 'anchor', title = 'Most frequent Anchor phrase in Score == 0.25')
#Most frequent target phrases with score == 0.25 (the title previously said "Anchor" by mistake):
plot_wordcloud_4(column = 'target', title = 'Most frequent Target phrase in Score == 0.25')

Word cloud for score == 0: Unrelated

In [None]:
# Subset of the training data with score == 0, followed by a preview.
train_df_5 = train_df.loc[train_df['score'] == 0]
train_df_5.head()

In [None]:
# Word cloud helper for the score == 0 subset (reads the global train_df_5):
def plot_wordcloud_5(column, title):
    """
    Draw a word cloud from one text column of train_df_5.

    params: column(string): name of the train_df_5 column whose phrases feed the cloud.
            title(string) : text shown after the 'Wordcloud: ' prefix in the figure title.
    """
    # Join every phrase of the column into one space-separated text
    text = ' '.join(train_df_5[column])

    # 800x800 cloud on a black background, with wordcloud's STOPWORDS excluded
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
        stopwords=set(STOPWORDS),
    )
    cloud_image = cloud_builder.generate(text)

    # Show the cloud on a square figure without axes
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Wordcloud: ' + title, fontsize=20)

    plt.show()

In [None]:
# Word cloud of the anchor phrases in the score == 0 subset.
plot_wordcloud_5('anchor', 'Most frequent Anchor phrase in Score == 0.0')

In [None]:
#Most frequent target phrases with score == 0 (the title previously said "Anchor" by mistake):
plot_wordcloud_5(column = 'target', title = 'Most frequent Target phrase in Score == 0.0')

Score classes and word cloud: The most common words are,

1. Score==1.0 

'anchor'---> layer, break, wire, grid

'target'---> member, component, opening, layer

2. Score ==0.75

'anchor'---> display, supply, sheet, roller

'target'---> system, device, layer, material

3. Score == 0.5

'anchor'---> coating, component, composite

'target'---> device, system, signal, member, material

4. Score == 0.25

'anchor'---> perfluoroalkyl, mechanism, displacement, group

'target'---> material, system, control, device

5. Score == 0.0

'anchor'---> portable, radio, waves, communication

'target'---> blood, system, water, data