## Efficient Keyword Matching
The `in` operator works well when searching through a few thousand dataset names, but if your list of dataset names contains millions of entries (like [this](https://www.kaggle.com/googleai/dataset-search-metadata-for-datasets)), it is not going to help.<br><br>
This kernel handles that scale as follows:
+ Splits paragraph into words
+ Creates sets of dataset names with different word count
+ Then picks ‘n’ consecutive words, joins them, and searches in the set<br>

In [None]:
import matplotlib.pyplot as plt
# Read the PNG (presumably an illustration of the matching idea, going by its
# file name) and display it on a 15 x 5 figure.
img = plt.imread('../input/kaggle-images/matching.png')
plt.figure(figsize=(15, 5))
plt.imshow(img)
# Hide the axes so that only the picture itself is shown.
plt.axis('off')
plt.show()

In [None]:
import re
import gc
import os
import json
import pandas as pd
from tqdm.auto import tqdm

# Hook tqdm into pandas; the `progress_apply` calls further down rely on it.
tqdm.pandas()

## Utility functions

In [None]:
# Directory holding the publication JSON files. NOTE: despite the variable
# name, this points at the `train` folder.
test_files_path = '../input/coleridgeinitiative-show-us-the-data/train'

def num_words(x):
    """Return the number of whitespace-separated words in *x*.

    Values that cannot be split like a string (for example ``None`` or a
    float NaN coming from a missing cell) count as zero words.
    """
    try:
        return len(x.split())
    except (AttributeError, TypeError):
        # Only swallow the errors a non-string value produces; a bare
        # ``except`` would also hide KeyboardInterrupt and SystemExit.
        return 0
    
def clean_text(txt):
    """Lower-case *txt* and collapse every non-alphanumeric run into one space.

    The input is converted with ``str()`` first, so non-string values are
    accepted as well.
    """
    lowered = str(txt).lower()
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return cleaned

def read_json_pub(filename, train_data_path=test_files_path, output='text'):
    """Read one publication JSON file and return its content as one string.

    The decoded JSON is iterated as a sequence of dicts; from each dict the
    ``section_title`` and ``text`` entries are used.

    Args:
        filename: File name without the ``.json`` extension.
        train_data_path: Directory that contains the file.
        output: ``'text'`` returns the section texts joined by a space,
            ``'head'`` returns the section titles joined by a space, and any
            other value returns titles and texts interleaved and joined
            by ``'. '``.

    Returns:
        The joined string selected by ``output``.
    """
    json_path = os.path.join(train_data_path, filename + '.json')

    # Be explicit about the encoding instead of relying on the platform
    # default, which is not UTF-8 everywhere.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A missing (or null) entry becomes '' so that str.join never meets None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    # Build only the string that was actually requested.
    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

## Get all the dataset names and their word count in a dataframe

In [None]:
# Load only the `name` column of each metadata snapshot, attach the word count
# of every name, and stack the snapshots into a single frame `df`.
metadata_csv_paths = [
    '../input/dataset-search-metadata-for-datasets/dataset_metadata_2020_10_16.csv',
    '../input/dataset-search-metadata-for-datasets/dataset_metadata_2020_08_17.csv/dataset_metadata_2020_08_17.csv',
]

name_frames = []
for csv_path in metadata_csv_paths:
    frame = pd.read_csv(csv_path)[['name']]
    frame['length'] = frame.name.progress_apply(num_words)
    name_frames.append(frame)

    # Reclaim memory after each snapshot before loading the next one.
    gc.collect()

df = pd.concat(name_frames).reset_index(drop=True)

# The per-snapshot frames are no longer needed once they are concatenated.
del name_frames, frame
gc.collect()

## Sets based on number of words in the dataset name

In [None]:
# Map each word count (1 to 24) to the set of dataset names with exactly that
# many words, so that a candidate phrase only needs one set lookup.
dict_of_set_of_datasets = {
    word_count: set(df.loc[df['length'] == word_count, 'name'].values)
    for word_count in range(1, 25)
}

## Keyword matching

In [None]:
def _find_dataset_mentions(tokens, name_sets, max_words=24):
    """Return the dataset names found in *tokens*, in order of appearance.

    For every start position the phrases of 1, 2, ... ``max_words``
    consecutive tokens are tried in that order; the first phrase present in
    ``name_sets[<number of words>]`` is recorded and the search moves on to
    the next start position. Repeated mentions are kept.

    Args:
        tokens: List of words of the document.
        name_sets: Mapping from word count to the set of dataset names with
            that many words; must have a key for every count up to
            ``max_words``.
        max_words: Longest phrase length (in words) to try.

    Returns:
        List of matched phrases (possibly with duplicates).
    """
    mentions = []
    n_tokens = len(tokens)
    for start in range(n_tokens):
        # Never read past the end of the document; this replaces padding the
        # token list with a sentinel word that could leak into a candidate.
        longest = min(max_words, n_tokens - start)
        for size in range(1, longest + 1):
            candidate = ' '.join(tokens[start:start + size])
            if candidate in name_sets[size]:
                mentions.append(candidate)
                break
    return mentions


# One row per publication, indexed by the file name with '.json' removed.
match_predictions = pd.DataFrame(
    index=[i.replace('.json', '') for i in os.listdir(test_files_path)],
    columns=['PredictionString'],
)

for id_ in tqdm(match_predictions.index):
    large_string = str(read_json_pub(id_, test_files_path))
    text_list = clean_text(large_string).split()

    # NOTE(review): the text is normalised with clean_text, whereas the names
    # in dict_of_set_of_datasets are used as loaded — confirm this is intended.
    predictions = '|'.join(
        _find_dataset_mentions(text_list, dict_of_set_of_datasets)
    )

    # Publications without any match keep the initial missing value.
    if predictions != '':
        match_predictions.loc[id_, 'PredictionString'] = predictions

# NOTE(review): the index is written without a column label — confirm that
# this matches the expected submission format.
match_predictions.to_csv('submission.csv')