In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Analysis plan for this notebook, kept as a bare string literal: it has no effect
# on later cells. Each numbered item corresponds to a numbered section heading below.
""" 1. Check the number of samples, columns, etc
    2. Check the maximum and minimum length of samples
    3. Check if some columns have null values and/or are empty or not
    4. Check whether anchor and target have numbers or any other characters than A-Za-z or not.
    5. Plot number of samples across each class (label)
    6. Plot number of samples across each cpc category
    7. Check how many anchor/target column values are unique or not individually 
    8. Word cloud for target, anchor and context column category
    9. find maximum count of targets across anchors (number of targets across a single anchor)
   10. does anchor column has values that are also present in targets
"""

# Reading CSV Files

In [None]:
# Single place to change when the competition data lives somewhere else
# (e.g. when running outside Kaggle).
DATA_DIR = "/kaggle/input/us-patent-phrase-to-phrase-matching"

# Labelled training pairs, unlabelled test pairs, and the submission template.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
spl_sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# 1. Checking out the data

In [None]:
# Report the training frame's (rows, columns), then preview its first five rows.
print("Shape of Training Data (train.csv):: Rows: {0}, Columns: {1} \n".format(*train_df.shape))
print("----------- Training Data -------------\n")
train_df.head()

In [None]:
# Report the test frame's (rows, columns), then preview its first five rows.
print("Shape of Test Data (test.csv):: Rows: {0}, Columns: {1} \n".format(*test_df.shape))
print("----------- Test Data -------------\n")
test_df.head()

In [None]:
# Report the sample-submission frame's (rows, columns), then preview its first five rows.
print("Shape of Sample Submission Data (sample_submission.csv):: Rows: {0}, Columns: {1} \n".format(*spl_sub_df.shape))
print("----------- Sample Submission Data -------------\n")
spl_sub_df.head()

In [None]:
# One column name per line.
print("Training Data Columns")
print("\n".join(map(str, train_df.columns)))

In [None]:
# One column name per line.
print("Testing Data Columns")
print("\n".join(map(str, test_df.columns)))

In [None]:
# One column name per line.
print("Sample Submission Columns")
print("\n".join(map(str, spl_sub_df.columns)))

# Data Columns:
    1. id: unique identifier
    2. anchor: phrase 1
    3. target: phrase 2
    4. context: cpc classification (to be joined with the cpc dataset)
    5. score: similarity score between 0 and 1


# 2. Check the maximum and minimum length of samples

## 2.1. Anchor Column

In [None]:
# Store the character length of every anchor phrase on both frames ...
for split_df in (train_df, test_df):
    split_df['anchor_length'] = split_df['anchor'].map(len)

# ... then build a {length: number of rows} mapping per split for plotting.
train_value_counts = train_df['anchor_length'].value_counts().to_dict()
test_value_counts = test_df['anchor_length'].value_counts().to_dict()


In [None]:
def barPlot(x_values_raw: dict.keys,y_values_raw: dict.values, x_label: str, y_label: str, plot_title: str,options: dict = None) -> None:
    """Draw a green bar chart on a new 10x5 figure and show it.

    Parameters
    ----------
    x_values_raw : iterable
        Bar positions or category labels. Materialised with ``list()``, so dict
        views and plain lists (both are passed by cells in this notebook) work.
    y_values_raw : iterable
        Bar heights, in the same order as ``x_values_raw``.
    x_label, y_label : str
        Axis labels.
    plot_title : str
        Figure title.
    options : dict, optional
        Recognised keys:
        ``'width'``   - bar width forwarded to ``plt.bar`` when it is not None.
        ``'overlap'`` - when exactly ``True``, x tick labels are rotated 90
                        degrees and right-aligned (for long text labels).
        Omitting the argument is the same as passing an empty dict.
    """
    # A None default avoids sharing one mutable dict object between all calls.
    opts = {} if options is None else options

    x_values = list(x_values_raw)
    y_values = list(y_values_raw)

    plt.figure(figsize=(10, 5))

    # Only pass ``width`` when the caller supplied one, so matplotlib's own
    # default applies otherwise.
    bar_kwargs = {'color': 'green'}
    bar_width = opts.get('width')
    if bar_width is not None:
        bar_kwargs['width'] = bar_width
    plt.bar(x_values, y_values, **bar_kwargs)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(plot_title)
    if opts.get('overlap', False) is True:
        plt.xticks(rotation=90, ha='right')
    plt.show()
    
   
 

### 2.1.1 Training Data

In [None]:
# Distribution of anchor phrase lengths (in characters) over the training rows.
barPlot(
    train_value_counts.keys(),
    train_value_counts.values(),
    x_label="Length of Training Samples",
    y_label="Count of Training Samples",
    plot_title="Length Vs Count of Training Samples (Anchor Column)",
)

### 2.1.2 Test Data

In [None]:
# Distribution of anchor phrase lengths (in characters) over the test rows.
barPlot(
    test_value_counts.keys(),
    test_value_counts.values(),
    x_label="Length of Test Samples",
    y_label="Count of Test Samples",
    plot_title="Length Vs Count of Test Samples (Anchor Column)",
)

## 2.2. Target Column

In [None]:
# Store the character length of every target phrase on both frames ...
for split_df in (train_df, test_df):
    split_df['target_length'] = split_df['target'].map(len)

# ... then build a {length: number of rows} mapping per split for plotting.
train_value_counts_target = train_df['target_length'].value_counts().to_dict()
test_value_counts_target = test_df['target_length'].value_counts().to_dict()

### 2.2.1 Training Data

In [None]:
# Distribution of target phrase lengths (in characters) over the training rows.
barPlot(
    train_value_counts_target.keys(),
    train_value_counts_target.values(),
    x_label="Length of Training Samples",
    y_label="Count of Training Samples",
    plot_title="Length Vs Count of Training Samples (Target Column)",
)

### 2.2.2 Test Data

In [None]:
# Distribution of target phrase lengths (in characters) over the test rows.
barPlot(
    test_value_counts_target.keys(),
    test_value_counts_target.values(),
    x_label="Length of Test Samples",
    y_label="Count of Test Samples",
    plot_title="Length Vs Count of Test Samples (Target Column)",
)

## 2.3: Context Column

In [None]:
# Character length of each context code. `len` is applied directly (as for the
# anchor/target length columns) instead of first copying every string into a list.
train_df['context_char_length'] = train_df['context'].apply(len)
test_df['context_char_length'] = test_df['context'].apply(len)

### 2.3.1: Training Data

In [None]:
# How many training rows have each context-code length.
train_context_lengths = train_df['context_char_length']
train_context_lengths.value_counts()

### 2.3.2: Test Data

In [None]:
# How many test rows have each context-code length.
test_context_lengths = test_df['context_char_length']
test_context_lengths.value_counts()

## Some Important Insights Gained:
  ### Target and Anchor Columns:
        1. The length plots for both columns, in both the training and test data, contain no zero-length entries, so neither column holds an empty string in this dataset.
 ### Context Column:
     1. The character-length counts for the context column add up to the number of rows in both the training and test datasets, so this column has no empty or missing values of any kind.

# 3. Explore the datatypes and check if columns have null values and/or are empty or not

```text
PseudoCode:
    for dataset in [training, test]:
        1. check the datatypes of the dataset
        2. check whether the dataset has columns with null values
        3. check whether the dataset has columns with None values
        4. check whether any string column of the dataset contains empty strings (already checked)
```

In [None]:
from collections import Counter
def isEmpty(df: pd.DataFrame,column: str):
    """Return how many entries of ``df[column]`` are exactly the empty string ``''``.

    Only an exact match counts: whitespace-only strings and missing values are
    not treated as empty.
    """
    empty_mask = df[column] == ''
    return int(empty_mask.sum())
    

## 3.1 Training Data

### 3.1.1 Datatypes

In [None]:
# Column dtypes of the training frame; this includes the *_length helper
# columns that earlier cells added to train_df.
train_df.dtypes

### 3.1.2 Columns having null values

In [None]:
# Number of null entries in each training column.
train_null_mask = train_df.isnull()
train_null_mask.sum()

### 3.1.3 Columns having NA/None values

In [None]:
# Number of NA entries in each training column.
train_na_mask = train_df.isna()
train_na_mask.sum()

### 3.1.4 Columns having '' as values

In [None]:
# Count exact empty-string values per text column of the training frame
# and show the counts as a one-row table.
toTest = ['anchor','target','context']
res = {col: isEmpty(train_df, col) for col in toTest}

pd.DataFrame([res])

## 3.2 Test Data

### 3.2.1 Datatypes

In [None]:
# Column dtypes of the test frame; this includes the *_length helper
# columns that earlier cells added to test_df.
test_df.dtypes

### 3.2.2 Columns having null values

In [None]:
# Number of null entries in each test column.
test_null_mask = test_df.isnull()
test_null_mask.sum()

### 3.2.3 Columns having NA/None values

In [None]:
# Number of NA entries in each test column.
test_na_mask = test_df.isna()
test_na_mask.sum()

### 3.2.4 Columns having '' as values

In [None]:
# Count exact empty-string values per text column of the test frame
# and show the counts as a one-row table.
toTest = ['anchor','target','context']
res = {col: isEmpty(test_df, col) for col in toTest}

pd.DataFrame([res])

## Some Important Insights Gained
### There are no missing values in anchor, target and context columns for both training and test datasets

# 4. Check whether anchor and target have numbers or any other characters than A-Za-z or not.

## 4.1 Training Data

In [None]:
# Text columns inspected by the character-class checks in section 4.
toTest = ['anchor','target']
def isAlphaNumeric(df:pd.DataFrame,column: str, ignore_whitespace: bool = False):
    """Count the rows of ``df[column]`` whose text is NOT purely alphanumeric.

    Despite the name, the return value is the number of *failing* rows, i.e.
    rows for which ``str.isalnum()`` is False.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the text column.
    column : str
        Name of a column of strings.
    ignore_whitespace : bool, default False
        ``str.isalnum()`` is False for any string containing a space, so with
        the default every multi-word phrase is counted as non-alphanumeric.
        Pass True to strip all whitespace before the check, so that only
        punctuation/symbols (and empty strings) are counted.

    Returns
    -------
    int
        Number of rows that fail the alphanumeric check.
    """
    def _is_alnum(text: str) -> bool:
        # "".join(text.split()) removes every run of whitespace.
        candidate = "".join(text.split()) if ignore_whitespace else text
        return candidate.isalnum()

    return sum(1 for text in df[column] if not _is_alnum(text))

def isNumeric(df:pd.DataFrame,column: str):
    """Count the rows of ``df[column]`` whose text contains at least one digit.

    Note the name is broader than the check: a row counts as soon as any single
    character satisfies ``str.isdigit()``; it need not be entirely numeric.
    """
    rows_with_digit = 0
    for text in df[column]:
        if any(character.isdigit() for character in text):
            rows_with_digit += 1
    return rows_with_digit

def isAlpha(df:pd.DataFrame,column: str):
    """Count the rows of ``df[column]`` whose text contains at least one letter.

    A row counts as soon as any single character satisfies ``str.isalpha()``;
    the text does not have to consist of letters only.
    """
    rows_with_letter = 0
    for text in df[column]:
        if any(character.isalpha() for character in text):
            rows_with_letter += 1
    return rows_with_letter

### 4.1.1 Check if anchor and target columns have numbers and A-Za-z only

In [None]:
# Per column: number of training rows that fail str.isalnum().
r = {col: isAlphaNumeric(train_df, col) for col in toTest}

print("The following result shows that for the anchor and target columns in the training dataset, following number of rows have special characters\n")
print(r)

### 4.1.2 Check if anchor and target columns have numbers also as part of text

In [None]:
# Per column: number of training rows containing at least one digit.
r = {col: isNumeric(train_df, col) for col in toTest}

print("The following result shows that for the anchor and target columns in the training dataset, following number of rows have numbers also as part of text \n")
print(r)

### 4.1.3 Check if anchor and target columns have text included(not only numbers)

In [None]:
# Per column: number of training rows containing at least one alphabetic character.
# isAlpha() uses any(...), so it does not test for "text only"; the message below
# is worded to match what is actually counted.
r = {col: isAlpha(train_df, col) for col in toTest}

print("The following result shows that for the anchor and target columns in the training dataset, following number of rows contain at least one alphabetic character \n")
print(r)

## Some Important Insights Gained:

##### There are special characters present in the anchor and target columns, including numbers, but each sample has text as part of it; there are no samples which only have numbers or special characters

## 4.2 Testing Data

### 4.2.1 Check if anchor and target columns have numbers and A-Za-z only

In [None]:
# Per column: number of test rows that fail str.isalnum().
r = {col: isAlphaNumeric(test_df, col) for col in toTest}

print("The following result shows that for the anchor and target columns in the testing dataset, following number of rows have special characters\n")
print(r)

### 4.2.2 Check if anchor and target columns have numbers also as part of text

In [None]:
# Per column: number of test rows containing at least one digit.
r = {col: isNumeric(test_df, col) for col in toTest}

print("The following result shows that for the anchor and target columns in the testing dataset, following number of rows have numbers also as part of text \n")
print(r)

### 4.2.3 Check if anchor and target columns have text included (not only numbers)

In [None]:
# Per column: number of test rows containing at least one alphabetic character.
# isAlpha() uses any(...), so it does not test for "text only"; the message below
# is worded to match what is actually counted.
r = {col: isAlpha(test_df, col) for col in toTest}

print("The following result shows that for the anchor and target columns in the testing dataset, following number of rows contain at least one alphabetic character \n")
print(r)

## Some Important Insights Gained:
There are special characters present in anchor and target columns, no numbers are present, furthermore, each sample has text as part of it, there are no samples which only have numbers or special characters

# 5. Plot number of samples across each class (label)

In [None]:
# {score: number of training rows}, ordered by ascending score.
scoreCount = train_df['score'].value_counts().sort_index().to_dict()

# Scores are passed as strings so each one is drawn as its own category.
score_labels = [str(score) for score in scoreCount]
barPlot(
    score_labels,
    scoreCount.values(),
    x_label='Classes',
    y_label='Sample Count',
    plot_title='Plot: Classes vs Sample Count',
)

## Some Important Insights Gained:

This problem can be solved in two ways, either we say that it is a multi-class classification problem or a regression problem, 
In context of a multi-class classification problem, there are imbalanced classes, so some kind of data augmentation may be in order

# 6. Plot number of samples across each cpc category

## 6.1 Training Data

### 6.1.1 Top 20 most frequent CPC Categories

In [None]:
# Rank CPC context codes by how many training rows use them (most frequent first)
# and plot the first 20.
cpcSampleCount = train_df['context'].value_counts().to_dict()
ranked_contexts = sorted(cpcSampleCount.items(), key=lambda pair: pair[1], reverse=True)
cpcSampleCountSorted = dict(ranked_contexts)
top20 = dict(ranked_contexts[:20])
barPlot(
    top20.keys(),
    top20.values(),
    x_label='CPC Category',
    y_label='Sample Count',
    plot_title='TOP-20:: CPC Category vs Sample Count in Training Set',
)

### 6.1.2 From Bottom:: 20 least frequent CPC Categories

In [None]:
# Rank CPC context codes by how many training rows use them (least frequent first)
# and plot the first 20.
cpcSampleCount = train_df['context'].value_counts().to_dict()
ranked_contexts = sorted(cpcSampleCount.items(), key=lambda pair: pair[1])
cpcSampleCountSorted = dict(ranked_contexts)
bottom20 = dict(ranked_contexts[:20])
barPlot(
    bottom20.keys(),
    bottom20.values(),
    x_label='CPC Category',
    y_label='Sample Count',
    plot_title='Bottom-20:: CPC Category vs Sample Count in Training Set',
)

## 6.2 Testing Data

### 6.2.1 Top 20 most frequent CPC Categories

In [None]:
# Rank CPC context codes by how many test rows use them (most frequent first)
# and plot the first 20.
cpcSampleCount = test_df['context'].value_counts().to_dict()
ranked_contexts = sorted(cpcSampleCount.items(), key=lambda pair: pair[1], reverse=True)
cpcSampleCountSorted = dict(ranked_contexts)
top20 = dict(ranked_contexts[:20])
barPlot(
    top20.keys(),
    top20.values(),
    x_label='CPC Category',
    y_label='Sample Count',
    plot_title='TOP-20:: CPC Category vs Sample Count in Testing Set',
)

### 6.2.2 From Bottom:: 20 least frequent CPC Categories

In [None]:
# Rank CPC context codes by how many test rows use them (least frequent first)
# and plot the first 20.
cpcSampleCount = test_df['context'].value_counts().to_dict()
ranked_contexts = sorted(cpcSampleCount.items(), key=lambda pair: pair[1])
cpcSampleCountSorted = dict(ranked_contexts)
bottom20 = dict(ranked_contexts[:20])
barPlot(
    bottom20.keys(),
    bottom20.values(),
    x_label='CPC Category',
    y_label='Sample Count',
    plot_title='Bottom-20:: CPC Category vs Sample Count in Testing Set',
)

# 7. Check how many anchor/target column values are unique or not individually 

## 7.1 Training Data

### 7.1.1 Anchor Column

In [None]:
# The 20 anchor phrases that occur in the most training rows.
anchorSampleCounts = train_df['anchor'].value_counts().to_dict()
ranked_anchors = sorted(anchorSampleCounts.items(), key=lambda pair: pair[1], reverse=True)
anchorSampleCountsSorted = dict(ranked_anchors)
top20 = dict(ranked_anchors[:20])
barPlot(
    top20.keys(),
    top20.values(),
    x_label='Anchor Text',
    y_label='Sample Count',
    plot_title='Top 20:: Anchor Text vs Sample Count in Training Set',
    options={'overlap':True},
)

In [None]:
# The 20 anchor phrases that occur in the fewest training rows.
anchorSampleCounts = train_df['anchor'].value_counts().to_dict()
ranked_anchors = sorted(anchorSampleCounts.items(), key=lambda pair: pair[1])
anchorSampleCountsSorted = dict(ranked_anchors)
bottom20 = dict(ranked_anchors[:20])
barPlot(
    bottom20.keys(),
    bottom20.values(),
    x_label='Anchor Text',
    y_label='Sample Count',
    plot_title='Bottom 20:: Anchor Text vs Sample Count in Training Set',
    options={'overlap':True},
)

### 7.1.2 Target Column

In [None]:
# The 20 target phrases that occur in the most training rows.
targetSampleCounts = train_df['target'].value_counts().to_dict()
ranked_targets = sorted(targetSampleCounts.items(), key=lambda pair: pair[1], reverse=True)
targetSampleCountsSorted = dict(ranked_targets)
top20 = dict(ranked_targets[:20])
barPlot(
    top20.keys(),
    top20.values(),
    x_label='Target Text',
    y_label='Sample Count',
    plot_title='Top 20:: Target Text vs Sample Count in Training Set',
    options={'overlap':True},
)

In [None]:
# The 20 target phrases that occur in the fewest training rows.
targetSampleCounts = train_df['target'].value_counts().to_dict()
# Fix: rank the *target* counts. The previous version sorted anchorSampleCounts
# (left over from the anchor cells), so this "Target Text" plot showed anchors.
ranked_targets = sorted(targetSampleCounts.items(), key=lambda item: item[1])
targetSampleCountsSorted = dict(ranked_targets)
bottom20 = dict(ranked_targets[:20])
barPlot(bottom20.keys(),bottom20.values(),'Target Text','Sample Count','Bottom 20:: Target Text vs Sample Count in Training Set',{'overlap':True})

## 7.2 Test Data

### 7.2.1 Anchor Column

In [None]:
# The 20 anchor phrases that occur in the most test rows.
anchorSampleCounts = test_df['anchor'].value_counts().to_dict()
ranked_anchors = sorted(anchorSampleCounts.items(), key=lambda pair: pair[1], reverse=True)
anchorSampleCountsSorted = dict(ranked_anchors)
top20 = dict(ranked_anchors[:20])
barPlot(
    top20.keys(),
    top20.values(),
    x_label='Anchor Text',
    y_label='Sample Count',
    plot_title='Top 20:: Anchor Text vs Sample Count in Test Set',
    options={'overlap':True},
)

In [None]:
# The 20 anchor phrases that occur in the fewest test rows.
anchorSampleCounts = test_df['anchor'].value_counts().to_dict()
ranked_anchors = sorted(anchorSampleCounts.items(), key=lambda pair: pair[1])
anchorSampleCountsSorted = dict(ranked_anchors)
bottom20 = dict(ranked_anchors[:20])
barPlot(
    bottom20.keys(),
    bottom20.values(),
    x_label='Anchor Text',
    y_label='Sample Count',
    plot_title='Bottom 20:: Anchor Text vs Sample Count in Test Set',
    options={'overlap':True},
)

### 7.2.2 Target Column

In [None]:
# The 20 target phrases that occur in the most test rows.
targetSampleCounts = test_df['target'].value_counts().to_dict()
ranked_targets = sorted(targetSampleCounts.items(), key=lambda pair: pair[1], reverse=True)
targetSampleCountsSorted = dict(ranked_targets)
top20 = dict(ranked_targets[:20])
barPlot(
    top20.keys(),
    top20.values(),
    x_label='Target Text',
    y_label='Sample Count',
    plot_title='Top 20:: Target Text vs Sample Count in Test Set',
    options={'overlap':True},
)

In [None]:
# The 20 target phrases that occur in the fewest test rows.
targetSampleCounts = test_df['target'].value_counts().to_dict()
# Fix: rank the *target* counts. The previous version sorted anchorSampleCounts
# (left over from the anchor cells), so this "Target Text" plot showed anchors.
ranked_targets = sorted(targetSampleCounts.items(), key=lambda item: item[1])
targetSampleCountsSorted = dict(ranked_targets)
bottom20 = dict(ranked_targets[:20])
barPlot(bottom20.keys(),bottom20.values(),'Target Text','Sample Count','Bottom 20:: Target Text vs Sample Count in Test Set',{'overlap':True})

# 8. Word cloud for anchor and target column

## 8.1 Training Data

### 8.1.1 Anchor Column

In [None]:
from wordcloud import WordCloud

# Join every training anchor phrase into one space-separated corpus and render
# it as a white-background word cloud (collocations disabled).
text = " ".join(train_df['anchor'].tolist())
cloud_builder = WordCloud(width=3200, height=1600, collocations=False, background_color='white')
word_cloud = cloud_builder.generate(text)

plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### 8.1.2 Target Column

In [None]:
from wordcloud import WordCloud

# Join every training target phrase into one space-separated corpus and render
# it as a white-background word cloud (collocations disabled).
text = " ".join(train_df['target'].tolist())
cloud_builder = WordCloud(width=3200, height=1600, collocations=False, background_color='white')
word_cloud = cloud_builder.generate(text)

plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## 8.2 Test Data

### 8.2.1 Anchor Column

In [None]:
from wordcloud import WordCloud

# Join every test anchor phrase into one space-separated corpus and render
# it as a white-background word cloud (collocations disabled).
text = " ".join(test_df['anchor'].tolist())
cloud_builder = WordCloud(width=3200, height=1600, collocations=False, background_color='white')
word_cloud = cloud_builder.generate(text)

plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### 8.2.2 Target Column

In [None]:
from wordcloud import WordCloud

# Join every test target phrase into one space-separated corpus and render
# it as a white-background word cloud (collocations disabled).
text = " ".join(test_df['target'].tolist())
cloud_builder = WordCloud(width=3200, height=1600, collocations=False, background_color='white')
word_cloud = cloud_builder.generate(text)

plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# 9. find maximum count of targets across anchors (number of targets across a single anchor)

## 9.1 Training data

In [None]:
# {anchor phrase: list of its distinct target phrases} for the training frame.
# A single groupby pass replaces one full-frame boolean filter per unique anchor,
# and iterating groups in sorted key order makes the order of ties below
# identical on every run (iterating a set of strings is not stable across
# interpreter sessions).
# NOTE(review): groupby skips rows whose anchor is missing; the null checks in
# section 3 report none, so no anchor should be lost - confirm if the data changes.
d = {
    anchor: list(targets.unique())
    for anchor, targets in train_df.groupby('anchor')['target']
}

# Anchors ordered by number of distinct targets, largest first.
targetAnchorSortedLength = dict(sorted(d.items(),key=lambda item: len(item[1]),reverse=True))


In [None]:
# {anchor: number of distinct targets} for the first 20 anchors of the ranking
# built in the previous cell (most targets first).
targetAnchorMaxCount = {
    anchor: len(targets)
    for anchor, targets in list(targetAnchorSortedLength.items())[:20]
}
# Fix: anchors are on the x axis and counts on the y axis; the two axis labels
# were previously passed the other way round.
barPlot(targetAnchorMaxCount.keys(),targetAnchorMaxCount.values(),'Anchor Text','Target Count','Top 20:: Number of Targets across anchors in Training Data',{'overlap':True})

In [None]:
# Re-rank the anchors by number of distinct targets, fewest first, and keep
# {anchor: number of distinct targets} for the first 20.
targetAnchorSortedLength = dict(sorted(d.items(),key=lambda item: len(item[1])))
targetAnchorMaxCount = {
    anchor: len(targets)
    for anchor, targets in list(targetAnchorSortedLength.items())[:20]
}
# Fix: anchors are on the x axis and counts on the y axis; the two axis labels
# were previously passed the other way round.
barPlot(targetAnchorMaxCount.keys(),targetAnchorMaxCount.values(),'Anchor Text','Target Count','Bottom 20:: Number of Targets across anchors in Training Data',{'overlap':True})

## 9.2 Test Data

In [None]:
# {anchor phrase: list of its distinct target phrases} for the test frame.
# A single groupby pass replaces one full-frame boolean filter per unique anchor,
# and iterating groups in sorted key order makes the order of ties below
# identical on every run (iterating a set of strings is not stable across
# interpreter sessions).
# NOTE(review): groupby skips rows whose anchor is missing; the null checks in
# section 3 report none, so no anchor should be lost - confirm if the data changes.
d = {
    anchor: list(targets.unique())
    for anchor, targets in test_df.groupby('anchor')['target']
}

# Anchors ordered by number of distinct targets, largest first.
targetAnchorSortedLength = dict(sorted(d.items(),key=lambda item: len(item[1]),reverse=True))


In [None]:
# {anchor: number of distinct targets} for the first 20 anchors of the ranking
# built in the previous cell (most targets first).
targetAnchorMaxCount = {
    anchor: len(targets)
    for anchor, targets in list(targetAnchorSortedLength.items())[:20]
}
# Fix: anchors are on the x axis and counts on the y axis; the two axis labels
# were previously passed the other way round.
barPlot(targetAnchorMaxCount.keys(),targetAnchorMaxCount.values(),'Anchor Text','Target Count','Top 20:: Number of Targets across anchors in Test Data',{'overlap':True})

In [None]:
# Re-rank the anchors by number of distinct targets, fewest first, and keep
# {anchor: number of distinct targets} for the first 20.
targetAnchorSortedLength = dict(sorted(d.items(),key=lambda item: len(item[1])))
targetAnchorMaxCount = {
    anchor: len(targets)
    for anchor, targets in list(targetAnchorSortedLength.items())[:20]
}
# Fixes: (1) anchors are on the x axis and counts on the y axis, so the axis
# labels are swapped back; (2) this cell plots `d` built from test_df, so the
# title now says "Test Data" instead of the copy-pasted "Training Data".
barPlot(targetAnchorMaxCount.keys(),targetAnchorMaxCount.values(),'Anchor Text','Target Count','Bottom 20:: Number of Targets across anchors in Test Data',{'overlap':True})

# 10. Does the anchor column have values that are also present in the target column?

## 10.1 Training Data

In [None]:
# Distinct anchor and target phrases of the training frame.
lsAnchor = list(set(train_df['anchor'].tolist()))
lsTarget = list(set(train_df['target'].tolist()))

# Membership is tested against a set: `x in lsTarget` on the list scanned every
# distinct target for every distinct anchor.
target_lookup = set(lsTarget)
common = [x for x in lsAnchor if x in target_lookup]
print("Common Text between Anchor and Target for Training Data: {0}".format(len(common)))

## 10.2 Testing Data

In [None]:
# Distinct anchor and target phrases of the test frame.
lsAnchor = list(set(test_df['anchor'].tolist()))
lsTarget = list(set(test_df['target'].tolist()))

# Membership is tested against a set: `x in lsTarget` on the list scanned every
# distinct target for every distinct anchor.
target_lookup = set(lsTarget)
common = [x for x in lsAnchor if x in target_lookup]
print("Common Text between Anchor and Target for Testing Data: {0}".format(len(common)))