## Exploration of External Data

Feel free to fork and mess around; however, I did not find the external data to be very useful for improving competition performance.  

I only scraped from -  
**kids.frontiersin.org,**    
**en.wikibooks.org,**  
**simple.wikipedia.org**  
since together they account for ~570 of the original sources of external text and represent a large portion of the text.

Please let me know if you manage to make the external data useful! Good luck!

[Notebook to dataset collection](https://www.kaggle.com/teeyee314/readability-url-scrape)

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import requests
import re
import warnings
# Suppress every warning category so library warnings do not clutter the cell output.
warnings.filterwarnings("ignore")

# Directory that holds the competition files read below (train.csv, test.csv).
BASE_DIR = '../input/commonlitreadabilityprize'

# List the directory contents as a quick check that the input is present.
print(os.listdir(BASE_DIR))

In [None]:
# Competition data.
train = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'))
# Scraped text; the cells below read its 'id', 'excerpt', 'target' and
# 'external_text' columns.
external = pd.read_csv('../input/readability-url-scrape/external.csv')

I am using the Jaccard score (the number of words two texts share divided by the number of distinct words across both) to measure similarity between the scraped and competition texts. Perhaps there is a more elegant way to do this.

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting sets of distinct tokens.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a token
        (both empty or whitespace-only) the score is defined as ``0.0``:
        two blank lines carry no evidence of shared content, and this avoids
        the division by zero the bare formula would hit.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

In [None]:
# Drop, in place, every row that has a missing value in any column.
external.dropna(inplace=True)

In [None]:
# Notebook display of the scraped frame.
external

In [None]:
def remove_newline(text):
    """Drop empty lines from *text*.

    The text is split on ``'\\n'``; lines that are exactly the empty string
    are discarded and the remaining lines are re-joined with ``'\\n'``.
    Lines that contain only whitespace are kept.
    """
    kept_lines = [line for line in text.split('\n') if line]
    return '\n'.join(kept_lines)

In [None]:
# Remove empty lines from the scraped text (remove_newline drops lines that
# are exactly "", it does not remove every '\n' character).
external['external_text'] = external['external_text'].map(remove_newline)

In [None]:
# Build whitespace-normalised copies of both text columns: every run of
# whitespace (including newlines) collapses to a single space.
for source_col, target_col in (
    ('excerpt', 'excerpt_modified'),
    ('external_text', 'external_text_modified'),
):
    external[target_col] = external[source_col].apply(lambda x: ' '.join(x.split()))

In [None]:
def compare_columns(col1, col2, compare_func):
    """Compare two columns of the global ``external`` frame row by row.

    Args:
        col1: Name of the column passed as the first argument of
            ``compare_func``.
        col2: Name of the column passed as the second argument.
        compare_func: Callable taking the two cell values of one row.

    Returns:
        A list with one ``compare_func`` result per row, in row order.
    """
    left_values = external[col1]
    right_values = external[col2]
    return [compare_func(left, right) for left, right in zip(left_values, right_values)]

# comparison functions
def len_diff(col1, col2):
    """Return the absolute difference in length (``len``) between two values."""
    first_len = len(col1)
    second_len = len(col2)
    return abs(second_len - first_len)

def word_diff(col1, col2):
    """Return the absolute difference in whitespace-separated word count."""
    first_words = col1.split()
    second_words = col2.split()
    return abs(len(second_words) - len(first_words))

In [None]:
# Add one column per comparison metric, each computed between the
# whitespace-normalised excerpt and the whitespace-normalised scraped text.
for metric_column, metric_func in (
    ('jaccard', jaccard),
    ('len_diff', len_diff),
    ('word_diff', word_diff),
):
    external[metric_column] = compare_columns('excerpt_modified','external_text_modified', metric_func)

In [None]:
# Distribution of Jaccard scores between scraped and competition texts.
# The trailing semicolon suppresses the notebook's textual repr of the plot object.
sns.displot(external['jaccard']);

I am using a threshold of 0.2, which is arbitrary and based only on looking at the distribution of Jaccard scores above.

In [None]:
# Target distribution of the rows whose Jaccard score is below 0.2.
sns.displot(external[external['jaccard'] < 0.2]['target']);

In [None]:
# Notebook display of the rows whose Jaccard score is below 0.2.
external[external['jaccard'] < 0.2]

In [None]:
# Distribution of the character-length difference for rows with Jaccard < 0.2.
sns.displot(external[external['jaccard'] < 0.2]['len_diff']);

In [None]:
# Distribution of the word-count difference for rows with Jaccard < 0.2.
sns.displot(external[external['jaccard'] < 0.2]['word_diff']);

In [None]:
# Notebook display of the rows whose Jaccard score is below 0.2
# (now including the len_diff / word_diff columns plotted above).
external[external['jaccard'] < 0.2]

In [None]:
# For every row, compare each line of the excerpt with each line of the
# scraped text and keep the best (maximum) line-to-line Jaccard score.
stats = [
    max(
        jaccard(excerpt_line, scraped_line)
        for excerpt_line in orig_text.split('\n')
        for scraped_line in ext_text.split('\n')
    )
    for orig_text, ext_text in zip(external['excerpt'], external['external_text'])
]

sns.displot(stats);

In [None]:
# Remove the competition excerpt from each scraped page so that only new text
# is left. A page is considered to contain the excerpt when some excerpt line
# and some scraped line have a Jaccard score above `threshold` (the value is
# arbitrary).
threshold = 0.5
filtered_external_text = []

for (orig_text, ext_text) in zip(external['excerpt'], external['external_text']):
    orig_lines = orig_text.split('\n')
    ext_lines = ext_text.split('\n')

    # Flattened row-major: entry k compares orig_lines[k // len(ext_lines)]
    # with ext_lines[k % len(ext_lines)].
    scores = [jaccard(orig, ext) for orig in orig_lines for ext in ext_lines]

    best = int(np.argmax(scores))
    if scores[best] > threshold:
        # `best` indexes the flattened pair list, not ext_lines, so it has to
        # be unravelled before it can be used as a line position.
        orig_idx, ext_idx = divmod(best, len(ext_lines))
        # The best pair may sit in the middle of the excerpt; step back by the
        # excerpt-line offset to reach the scraped line where the excerpt starts.
        start = max(ext_idx - orig_idx, 0)
        # Assumes the excerpt occupies len(orig_lines) consecutive scraped
        # lines -- TODO confirm for excerpts that contain blank lines.
        end = start + len(orig_lines)
        joined_text = '\n'.join(ext_lines[:start] + ext_lines[end:])
    else:
        joined_text = '\n'.join(ext_lines)
    filtered_external_text.append(joined_text)

In [None]:
# Store the scraped text with the excerpt span removed, one entry per row.
external['usable_external'] = filtered_external_text

In [None]:
# Keep rows whose whole-text Jaccard score is below 0.2 and that still have
# some text left after the excerpt was removed.
usable = external.query('jaccard < 0.2 and usable_external != ""')

### Export usable external text

In [None]:
# Keep only the id and the cleaned external text, with a fresh 0..n-1 index.
export = usable[['id', 'usable_external']].reset_index(drop=True)
# Written with the default index column included.
export.to_csv('external_df.csv')
export.head()

# Using exported data

Merge the train dataframe with the external dataframe

In [None]:
# Left join: every train row is kept; 'usable_external' is NaN where no
# external text was exported for that id.
merged = pd.merge(train, export, on='id', how='left')

In [None]:
# Notebook display of the merged frame.
merged

In [None]:
def create_external_df(train=None, sample_rate=0.1, size=205, min_words=100,
                       random_state=None):
    """Augment *train* with pseudo-labeled chunks of the exported external text.

    *train* is left-joined with the global ``export`` frame on ``'id'``. For
    every row that has external text, that text is split into consecutive
    chunks of ``size`` words; chunks with ``min_words`` words or fewer are
    discarded. Each remaining chunk becomes a new row that reuses the ``id``
    and ``target`` of the row it came from. A fraction ``sample_rate`` of
    those rows is appended to the original rows and the result is shuffled.

    Args:
        train: DataFrame with ``'id'``, ``'target'`` and ``'excerpt'`` columns.
        sample_rate: Fraction of the chunk rows to keep.
        size: Number of words per chunk.
        min_words: A chunk is kept only if it has more than this many words.
        random_state: Optional seed forwarded to both ``DataFrame.sample``
            calls; ``None`` keeps the sampling and shuffle non-deterministic.

    Returns:
        A shuffled DataFrame with columns ``'id'``, ``'target'``, ``'excerpt'``.
    """
    train = pd.merge(train, export, on='id', how='left')
    with_external = train[~train['usable_external'].isnull()]

    # Chunks are built in a plain list rather than assigned as a column on the
    # filtered slice, which would be a chained assignment on a view/copy of
    # the merged frame.
    data = []
    for ID, target, text in zip(with_external['id'],
                                with_external['target'],
                                with_external['usable_external']):
        words = text.split()
        for begin in range(0, len(words), size):
            chunk = words[begin:begin + size]
            if len(chunk) > min_words:
                data.append([ID, target, ' '.join(chunk)])

    print(f'Rows of size {size}:', len(data))
    print(f'Size of external df: {len(data)}. Sampling {round(len(data) * sample_rate)} rows')
    df = pd.DataFrame(data, columns=['id','target', 'excerpt']).sample(
        frac=sample_rate, random_state=random_state)
    return pd.concat([train[['id', 'target', 'excerpt']], df]).sample(
        frac=1, random_state=random_state)

#### How does the distribution of pseudo-labeled external data look?

In [None]:
# Preview the target distribution of the augmented training set.
# NOTE(review): the fold indices are never used and the loop exits after its
# first iteration, so the augmented frame is built exactly once from the full
# `train`, not per fold.
kf = KFold(n_splits=5)
for trn_idx, val_idx in kf.split(train):
    shuffled_combined_df = create_external_df(train=train)
    sns.displot(shuffled_combined_df['target']);
    break