<h1 style='background:teal; color:white; padding:20px;'>
Coleridge: Words in the titles</h1>

ref [Coleridge competition @ Kaggle](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data)

## purpose
illustrate search for frequent words in dataset titles

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # full screen width of Jupyter notebook
pd.options.display.max_rows, pd.options.display.max_columns = 500, 100

from collections import Counter
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words('english'))  # 179 englist stopwords

data_path = '../input/coleridgeinitiative-show-us-the-data/'
train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
train_df.shape

In [None]:
def titles_to_word_sets(titles_series):
    """Turn a Series of titles into stopword-free word sets, one per distinct title.

    Titles are lower-cased, every run of characters other than ``a-z`` and
    space is deleted, and titles that are identical after that cleanup are
    collapsed into one. Each remaining title is split on whitespace into a
    set of words, from which the words in ``eng_stopwords`` are removed.

    Returns a list of sets.
    """
    lowered = titles_series.str.lower()
    letters_only = lowered.replace(r'[^a-z ]+', '', regex=True)
    word_sets = []
    for title in letters_only.unique():
        words = set(title.split())
        # drop stopwords from this title's vocabulary
        word_sets.append(words - eng_stopwords)
    return word_sets

def count_occurences(list_of_word_sets):
    """Tally words across all the given word collections.

    When every element is a set, a word's count is the number of sets that
    contain it. Returns a plain dict mapping word -> count, ordered from the
    highest to the lowest count; words with equal counts keep the order in
    which they were first encountered.
    """
    tally = Counter()
    for word_set in list_of_word_sets:
        for word in word_set:
            tally[word] += 1
    # most_common() without an argument lists every item, highest count first.
    return dict(tally.most_common())

# One word set per distinct cleaned label, then the word ranking over those sets.
titles_words = titles_to_word_sets(train_df['cleaned_label'])
occurencies = count_occurences(titles_words)
print("Top 10 words by frequency:")
# (word, count) pairs for the ten highest-ranked words, shown as cell output
[(word, occurencies[word]) for word in list(occurencies)[:10]]

In [None]:
def find_coverage_words(words_sets, min_occurencies=3):
    """Greedily pick a few words that together cover most of the word sets.

    At each step the word with the highest count among the still-uncovered
    word sets is selected, and every set containing it is dropped. The search
    stops when nothing is left to cover, when the best remaining word occurs
    fewer than ``min_occurencies`` times, or when the remaining sets hold no
    words at all (e.g. titles made only of stopwords).

    Args:
        words_sets: iterable of word collections (typically sets), one per title.
        min_occurencies: minimum count a word must reach among the uncovered
            sets to be selected. Set to 1 to cover every non-empty word set.

    Returns:
        A tuple ``(coverage_words, n_remaining)``. ``coverage_words`` maps each
        selected word, in selection order, to its count among the uncovered
        sets at the moment it was selected, plus a final ``'_REMAINING_'``
        entry holding the number of sets left uncovered. ``n_remaining`` is
        that same number.
    """
    remaining = list(words_sets)  # work on a private list; the caller's object is untouched
    coverage_words = {}
    while remaining:
        counts = Counter(word for word_set in remaining for word in word_set)
        if not counts:
            # Only empty word sets are left; no word can cover them.
            break
        # most_common(1) yields the top word without sorting the whole tally.
        next_word, next_word_count = counts.most_common(1)[0]
        if next_word_count < min_occurencies:
            break
        remaining = [word_set for word_set in remaining if next_word not in word_set]
        coverage_words[next_word] = next_word_count
    coverage_words['_REMAINING_'] = len(remaining)
    return coverage_words, len(remaining)

coverage_words, n_remaining = find_coverage_words(titles_words)
# '_REMAINING_' is a bookkeeping entry, not a covering word, so it is left out of the count.
n_coverage_words = len([word for word in coverage_words if word != '_REMAINING_'])
print(f"Analyzing words from {len(titles_words)} titles; found {n_coverage_words} words")
coverage_words

In [None]:
# Baseline for comparison: number of titles that contain at least one of the
# `top_n` highest-ranked words from `occurencies`.
top_n = 11
frequent_set = set(list(occurencies)[:top_n])
sum(1 for title_words in titles_words if not frequent_set.isdisjoint(title_words))

**Observations:**
- With **greedy search** for coverage: at least 1 of the 11 selected words is present in **88.5%** of the dataset titles (115 of 130).

- When taking the same number of the **most frequent words**, we observe that only **74.6%** of titles (97 of 130) contain at least one of the 11 most frequent words.