In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide display configuration.

# pandas: never truncate cell text, show at most 50 rows, and render floats
# with thousands separators and two decimals
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 50
pd.set_option('display.float_format', '{:,.2f}'.format)

# seaborn/matplotlib: default to wide 12x5 figures
sns.set(rc={'figure.figsize': (12, 5)})


In [None]:
# Read the train and test splits from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/us-patent-phrase-to-phrase-matching/train.csv",
        "../input/us-patent-phrase-to-phrase-matching/test.csv",
    )
)

In [None]:
# (rows, columns) of each split
print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')

In [None]:
# first ten training rows
train_df.iloc[:10]

In [None]:
# first five test rows
test_df.head(n=5)

In [None]:
# column names, dtypes, non-null counts and memory usage of the training frame
train_df.info()

The only numeric column in the data is the actual value we have to predict, score.  All other provided data points are categorical.

In [None]:
# summary statistics (count, mean, std, quartiles) for the numeric columns
# of the training data
train_df.describe()

In [None]:
# distinct values taken by the target column, in order of first appearance
pd.unique(train_df['score'])

In [None]:
# count / unique / top / freq for the object-typed (text) columns
train_df.select_dtypes(include=['object']).describe()

In [None]:
# bar chart of how many training rows carry each score value,
# ordered by score rather than by frequency
(
    train_df['score']
    .value_counts()
    .sort_index()
    .plot.bar()
)

In [None]:
# Bar chart of the number of distinct values in each training column,
# with the exact count written above every bar.
plt.figure(figsize=(10, 6))
cols = ['anchor', 'context', 'target', 'id']
# dropna=False counts a missing value as one distinct value
uniques = [train_df[col].nunique(dropna=False) for col in cols]
sns.set(font_scale=1.2)
ax = sns.barplot(x=cols, y=uniques)
ax.set_xlabel('Feature')
ax.set_ylabel('unique count')
ax.set_title('Number of unique values per Training feature')
for bar, uniq in zip(ax.patches, uniques):
    # centre the label horizontally, 10 data units above the top of the bar
    label_x = bar.get_x() + bar.get_width() / 2.
    label_y = bar.get_height() + 10
    ax.text(label_x, label_y, uniq, ha="center")


In [None]:
# Bar chart of the number of distinct values in each test column,
# with the exact count written at the top of every bar.
plt.figure(figsize=(10, 6))
cols = ['anchor', 'context', 'target', 'id']
# dropna=False counts a missing value as one distinct value
uniques = [test_df[col].nunique(dropna=False) for col in cols]
sns.set(font_scale=1.2)
ax = sns.barplot(x=cols, y=uniques)
ax.set_xlabel('Feature')
ax.set_ylabel('unique count')
ax.set_title('Number of unique values per Test feature')
for bar, uniq in zip(ax.patches, uniques):
    # centre the label horizontally, directly at the top of the bar
    label_x = bar.get_x() + bar.get_width() / 2.
    label_y = bar.get_height()
    ax.text(label_x, label_y, uniq, ha="center")


The test set is very small and was likely hand-picked to represent specific contexts.  Its distribution is not representative of the distribution in the training set.  Using a properly designed local validation set will be more important than leaderboard scores.

## Anchors Check:

## Check distribution pattern for anchors

In [None]:
# the 60 anchors with the most training rows
train_df['anchor'].value_counts().head(60).plot.bar(title="Top 60 Anchors")

In [None]:
# the 60 anchors with the fewest training rows
train_df['anchor'].value_counts().tail(60).plot.bar(title="Bottom 60 Anchors")

Anchors have skewed distribution with some having over 140 examples, while a few have only 1-5.

## Context Check:

In [None]:
# the 60 contexts with the most training rows
train_df['context'].value_counts().head(60).plot.bar(title='Top 60 Context Counts')

In [None]:
# the 60 contexts with the fewest training rows
train_df['context'].value_counts().tail(60).plot.bar(title='Bottom 60 Context Counts')

In [None]:
# largest and smallest number of training rows for any single context;
# cast to plain ints so the displayed tuple shows bare numbers
context_sizes = train_df['context'].value_counts()
int(context_sizes.max()), int(context_sizes.min())

The context distribution is even more skewed than the anchor distribution: the most frequent context has 2186 examples, while the least frequent has only 18.

## Visualization of Score Distributions by Context

Look for patterns in the distribution of scores by context, in terms of absolute counts (which score/context combinations are most represented).

In [None]:
# raise the row display limit to 300 for the table below
pd.options.display.max_rows = 300

# green colour ramp for the heatmap
cm = sns.light_palette("green", as_cmap=True)

# rows = context, columns = score, cells = number of training samples;
# axis=None shades every cell against the whole table, so the colours
# compare absolute counts across contexts
df_cross = pd.crosstab(index=train_df['context'], columns=train_df['score'])
df_cross.style.background_gradient(cmap=cm, axis=None)


Look for patterns in the distribution of scores by context, in terms of the percentage distribution within each context (each row = 100%; look at how the samples are spread across scores).

In [None]:
# orange colour ramp for the heatmap
cm = sns.light_palette("orange", as_cmap=True)

# Share of each context's samples that falls on every score: each row is
# divided by its own row total, so every row sums to 1.
df_cross_percent = df_cross.div(df_cross.sum(axis=1), axis=0)

# axis=1 shades each row independently. The cells hold fractions in [0, 1],
# so use the '%' format type, which multiplies by 100 before appending the
# sign; the previous '{:.2f}%' rendered a 25% share as "0.25%".
df_cross_percent.style.background_gradient(cmap=cm, axis=1).format('{:.2%}')

In [None]:
# lower the row display limit to 30 after the large tables above
pd.options.display.max_rows = 30

### # of Anchors per context check

In [None]:
# Number of distinct anchors in each context, largest first.
# 'anchor' is selected before aggregating so nunique() runs on that single
# column instead of on every column of the frame (the other columns were
# computed and then thrown away).
context_anchor_count = (
    train_df.groupby('context')['anchor']
    .nunique()
    .reset_index(name='anchor_count')
    .sort_values('anchor_count', ascending=False)
    .reset_index(drop=True)
)

context_anchor_count


Anchors per context range from 98 (H01) to just 1.

### Get additional stats per context

In [None]:
# number of training rows (non-null ids) in each context
context_count = (
    train_df.groupby('context')['id']
    .count()
    .reset_index(name='count_samples')
)


In [None]:
# Attach the per-context row counts to the per-context anchor counts, then
# derive the average number of rows per anchor within each context.
context_stats = context_anchor_count.merge(context_count, how='left', on='context')
context_stats['avg_samples_per_anchor'] = (
    context_stats['count_samples'].div(context_stats['anchor_count'])
)


In [None]:
# Mean and median score per context, joined onto the per-context stats.
# Only the 'score' column is aggregated: calling mean()/median() on the whole
# grouped frame also tries to average the text columns (id, anchor, target),
# which raises TypeError on pandas >= 2.0, where numeric_only defaults to False.
score_by_context = train_df.groupby('context')['score']
context_mean_score = score_by_context.mean().reset_index(name='mean_score')
context_median_score = score_by_context.median().reset_index(name='median_score')

context_stats = pd.merge(context_stats, context_mean_score, how='left', on='context')
context_stats = pd.merge(context_stats, context_median_score, how='left', on='context')

print('\n\nStats per context')

context_stats

In [None]:
# summary statistics of the per-context figures, taken across all contexts
context_stats.describe()

There are 106 unique contexts. On average there are 344 targets per context, with 16 anchors per context and 20 targets per anchor.   However, as seen before, the dataset is heavily skewed, with some anchors and contexts having very few samples to learn from.

## Anchors & Context Check

Check for relationships between anchors and context, since the same anchor can be found in multiple contexts.

In [None]:
# number of distinct (anchor, context) pairs in the training data
n_anchor_context_pairs = train_df[['anchor', 'context']].drop_duplicates().shape[0]
print(f'Unique anchor/context combinations: {n_anchor_context_pairs}')

Which anchors are found in more than one context?

In [None]:
# Anchors that occur in more than one context, most widely used first.
# 'context' is selected before aggregating so nunique() runs on that single
# column instead of on every column of the frame.
anchor_context_count = (
    train_df.groupby('anchor')['context']
    .nunique()
    .reset_index(name='context_count')
)

# keep only anchors seen in at least two contexts
anchor_context_count = (
    anchor_context_count[anchor_context_count['context_count'] > 1]
    .sort_values('context_count', ascending=False)
    .reset_index(drop=True)
)

anchor_context_count

Out of 733 anchors, 427 anchors are found in more than one context.  Highest is 'elevation view' with 10 contexts.