In [None]:
import pandas as pd
import os
import json
import re
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

def clean_text(txt):
    """Normalise a value into lower-case, space-separated alphanumeric text.

    txt - any object; it is converted with ``str`` before cleaning.

    Returns the lower-cased text in which every run of characters outside
    ``A-Za-z0-9`` has been replaced by one space, with leading and trailing
    spaces removed.
    """
    lowered = str(txt).lower()
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return alnum_only.strip()

def totally_clean_text(txt):
    """Run ``clean_text`` on ``txt`` and squeeze any repeated spaces into one.

    txt - any object accepted by ``clean_text``.

    Returns the cleaned string.
    """
    return re.sub(' +', ' ', clean_text(txt))

def text_cleaning(text):
    '''
    Converts all text to lower case and replaces every run of characters
    outside A-Za-z0-9 (special characters, emojis, whitespace, any other
    non-ASCII character) with a single space, then trims the ends.

    text - Sentence that needs to be cleaned (any object; converted with str)

    Returns the cleaned string.
    '''
    # One substitution does all the work: each maximal run of non-alphanumeric
    # characters becomes exactly one space, so the result can contain neither
    # repeated spaces nor emojis. The former follow-up passes that collapsed
    # spaces and stripped emoji ranges could never change the string.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

# Root folder (relative path) from which the CSV and JSON inputs are read.
path = '../input/coleridgeinitiative-show-us-the-data'

# Lower-case English stop words. The original order is kept, including the
# repeated entries 'of' and 'the'.
stopwords = [
    'ourselves', 'hers', 'the', 'between', 'yourself', 'but', 'again', 'of',
    'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with',
    'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
    'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
    'is', 's', 'am', 'or', 'who', 'as', 'from', 'him',
    'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these',
    'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her',
    'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while',
    'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all',
    'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
    'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
    'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did',
    'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just',
    'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after',
    'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against',
    'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here',
    'than',
]

In [None]:
# Load the training metadata table (original author's note: shape (19661, 5)).
train_csv_path = path + '/train.csv'
df_train = pd.read_csv(train_csv_path)

# Print, for each column, how many distinct values it holds.
for col in df_train:
    distinct_values = df_train[col].unique()
    print(f"{col}: {len(distinct_values)}")

# Output recorded by the original author:
# Id: 14316
# pub_title: 14271
# dataset_title: 45
# dataset_label: 130
# cleaned_label: 130


# Build df_input: for every training publication, keep the sections whose text
# contains one of that publication's dataset labels, tagged with the label and
# the publication id.
#
# Frames are collected in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every step.
# The leading empty frame fixes the column order and covers the no-match case.
matched_sections = [pd.DataFrame(columns=['id', 'section_title', 'text', 'data_label'])]
for ID in df_train['Id'].unique():
    df = pd.read_json(path + '/train/{}.json'.format(ID))

    for data_label in df_train[df_train['Id'] == ID]['dataset_label'].values:
        # regex=False: the label is a literal string, not a pattern. As a
        # regex, a label such as "... (ADNI)" turns the parentheses into a
        # capture group and therefore never matches the literal "(ADNI)".
        # na=False: a section with missing text counts as "no match".
        contains_label = df['text'].str.contains(data_label, regex=False, na=False)
        new_df = df[contains_label].copy(deep=True)
        new_df['data_label'] = data_label
        new_df['id'] = ID
        matched_sections.append(new_df)

df_input = pd.concat(matched_sections, ignore_index=True, sort=False)


# words = df_input['data_label'].values  # numpy.ndarray of String


# Publications to predict for: one row per Id in the sample submission.
df_test = pd.read_csv(path + '/sample_submission.csv')

# Load every test publication and tag its sections with the publication id.
# Frames are collected and concatenated once at the end instead of growing
# df_test_input inside the loop, which re-copies all previous rows each time.
# The leading empty frame fixes the column order and covers the empty case.
test_sections = [pd.DataFrame(columns=['id', 'section_title', 'text'])]
for ID in df_test['Id'].values:
    df = pd.read_json(path + '/test/{}.json'.format(ID))
    df['id'] = ID
    test_sections.append(df)

df_test_input = pd.concat(test_sections, ignore_index=True, sort=False)

# Keep only sections with non-empty text (missing text gives a NaN length,
# which also fails the > 0 check).
df_test_input['length'] = df_test_input.text.str.len()
df_test_input = df_test_input[df_test_input.length > 0]

In [None]:
def jaccard_similarity(s1, s2):
    """Jaccard similarity of two '|'-separated item strings.

    Each string is split on '|' and treated as a set, so repeated items inside
    one string do not affect the score.

    s1, s2 - strings such as "label a|label b"

    Returns |A & B| / |A | B| as a float in [0, 1].
    """
    items_1 = set(s1.split('|'))
    items_2 = set(s2.split('|'))
    # str.split always yields at least one item (possibly ''), so the union is
    # never empty and the division is safe.
    return len(items_1 & items_2) / len(items_1 | items_2)

In [None]:
# Lower-cased string form of every value found in the three label columns of
# the training table, gathered into one set.
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']
all_labels = {
    str(raw_label).lower()
    for row in df_train[label_columns].itertuples(index=False)
    for raw_label in row
}

len(all_labels)

In [None]:
# datasets_titles = [str(x).lower() for x in df_input['data_label'].unique()]

# For every test publication, report each known label that occurs verbatim
# (case-insensitively) in its section texts or section titles.
labels = []
for index in df_test['Id']:
    tmp_df = df_test_input[df_test_input['id'] == index]
    # Section bodies and section titles are joined with an explicit newline so
    # the last body word and the first title word cannot fuse into one token.
    match_text = '\n'.join([
        tmp_df.text.str.cat(sep='\n'),
        tmp_df.section_title.str.cat(sep='\n'),
    ]).lower()

    # all_labels holds several raw spellings of the same dataset, and these can
    # clean to the same string: a set removes such duplicates.
    found = {
        text_cleaning(dataset_label)
        for dataset_label in all_labels
        if dataset_label in match_text
    }
    found.discard('')  # a label made only of punctuation cleans to nothing
    # sorted() makes the output independent of set iteration order.
    labels.append('|'.join(sorted(found)))

# `labels` is in df_test['Id'] order, which comes from this same CSV.
submission_df = pd.read_csv(path + '/sample_submission.csv', index_col=0)
submission_df['PredictionString'] = labels
submission_df.to_csv('./submission.csv')
# Presumably the Kaggle output directory; skipped when it does not exist so
# the notebook also finishes outside that environment.
if os.path.isdir('/kaggle/working'):
    submission_df.to_csv('/kaggle/working/submission.csv')
print('submission complete')