In [None]:
import os
import re
import json
import pickle
from collections import defaultdict, Counter
import gc

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
tqdm.pandas()

%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # full screen width of Jupyter notebook
pd.options.display.max_rows, pd.options.display.max_columns = 500, 100

# NLP imports
import nltk

# Neural network imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print( 'tf version:', tf.__version__)

In [None]:
%%time
""" Loading data"""
# Root folder of the competition input data; the CSV paths below are built relative to it.
data_path = '../input/coleridgeinitiative-show-us-the-data/'

def read_json_from_folder(folder_name):
    """
    Load every ``*.json`` file directly inside ``folder_name``.

    folder_name - directory to scan (not recursive)

    Returns a dict mapping each file name without its ``.json`` extension to
    the parsed JSON content of that file.

    Entries that are not ``.json`` files (other files, sub-directories) are
    skipped instead of being opened and parsed.
    """
    json_dict = {}
    # sorted() makes the insertion order independent of the filesystem's listing order.
    for filename in sorted(os.listdir(folder_name)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.json':
            continue
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(os.path.join(folder_name, filename), encoding='utf-8') as f:
            json_dict[stem] = json.load(f)
    return json_dict

# Bulk-loading the per-paper JSON folders is left disabled; uncomment to enable:
# train_dict = read_json_from_folder(os.path.join(data_path, 'train'))
# test_dict = read_json_from_folder(os.path.join(data_path, 'test'))
train_csv_path = os.path.join(data_path, 'train.csv')
sample_sub_csv_path = os.path.join(data_path, 'sample_submission.csv')

train_df = pd.read_csv(train_csv_path)
sample_sub = pd.read_csv(sample_sub_csv_path)

# Cell output: the shapes of the two frames that were just loaded.
train_df.shape, sample_sub.shape

In [None]:
# External dataset 1: a CSV of additional dataset titles (its 'title' column is used for matching later).
ext_dataset_csv = '../input/bigger-govt-dataset-list/data_set_800.csv'
adnl_govt_labels = pd.read_csv(ext_dataset_csv)

# Preview five randomly chosen rows.
adnl_govt_labels.sample(5)

In [None]:
def clean_training_text(txt):
    """
    Case-preserving variant of clean_text.

    The input is converted with str(), every run of characters outside
    A-Z / a-z / 0-9 is replaced by one space, and surrounding whitespace is
    stripped. Unlike clean_text, the text is not lowercased.
    """
    as_text = str(txt)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return spaced.strip()

def clean_text(txt):
    """
    Lowercase the input and reduce it to space-separated alphanumeric tokens.

    The input is converted with str() and lowercased, every run of characters
    outside A-Z / a-z / 0-9 is replaced by one space, and surrounding
    whitespace is stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """
    Apply clean_text, then squeeze any run of spaces down to a single space.
    """
    return re.sub(' +', ' ', clean_text(txt))

def text_cleaning(text):
    '''
    Converts all text to lower case, removes special characters, emojis and multiple spaces.

    Every run of characters outside A-Z / a-z / 0-9 (punctuation, whitespace,
    emojis and any other non-ASCII symbol) is replaced by one space, and
    leading/trailing spaces are stripped.

    text - Sentence that needs to be cleaned; it is converted with str() first,
           so non-string input is cleaned as its string form.

    Returns the cleaned string.
    '''
    # One substitution does all the work: the character class already matches
    # spaces and emoji code points, and the `+` collapses each run into a single
    # space. Separate "squeeze spaces" and "strip emojis" passes would never find
    # anything left to change.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
def read_json_pub(filename, train_data_path=paper_train_folder, output='text'):
    """
    Read one publication's JSON file and return its content as a single string.

    The file ``<train_data_path>/<filename>.json`` must contain a list of
    section objects; each section is read through its ``section_title`` and
    ``text`` keys.

    filename - publication id, i.e. the file name without the ``.json`` suffix
    train_data_path - folder that contains the publication JSON files
    output - 'text': all section texts joined by a space (default);
             'head': all section titles joined by a space;
             any other value: title and text of every section, interleaved
             and joined by '. '

    A section whose title or text is missing or null contributes an empty
    string, so the join does not fail with a TypeError on ``None``.
    """
    json_path = os.path.join(train_data_path, filename + '.json')
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    def _field(section, key):
        # Missing or null values become '' so they can take part in str.join.
        value = section.get(key)
        return '' if value is None else str(value)

    # Build only the string that was asked for.
    if output == 'text':
        return ' '.join(_field(section, 'text') for section in sections)
    if output == 'head':
        return ' '.join(_field(section, 'section_title') for section in sections)

    combined = []
    for section in sections:
        combined.append(_field(section, 'section_title'))
        combined.append(_field(section, 'text'))
    return '. '.join(combined)

In [None]:
# Tag every training publication with the external dataset titles that occur in its text.
#
# The publication text is lowercased and stripped of punctuation before matching, so
# the titles must go through the same normalisation; a raw title containing capitals
# or punctuation could otherwise never be found. clean_text applies the same
# lowercase + non-alphanumeric collapse as text_cleaning.
# Missing titles are dropped (str(NaN) == 'nan' would match inside ordinary words) and
# titles that clean down to '' are discarded (the empty string is a substring of
# everything). The set removes duplicates; sorting reproduces the ordering of the
# joined label string.
ext_titles = sorted({
    cleaned_title
    for cleaned_title in (clean_text(raw_title) for raw_title in adnl_govt_labels['title'].dropna())
    if cleaned_title
})

# Each publication file is read and scanned once, even when its Id appears on
# several rows of train_df; all rows sharing an Id receive the same label string.
labels_by_id = {}
for pub_id in tqdm(train_df['Id'].unique()):
    pub_text = text_cleaning(read_json_pub(pub_id))
    # Plain substring match of each cleaned title against the cleaned publication text.
    labels_by_id[pub_id] = '|'.join(title for title in ext_titles if title in pub_text)

# '|'-separated cleaned titles; '' when no external title was found.
train_df['ext_cleaned_label'] = train_df['Id'].map(labels_by_id)

In [None]:
# Write the labelled training frame to the working directory, without the index column.
labelled_csv_path = './train_df.csv'
train_df.to_csv(labelled_csv_path, index=False)

In [None]:
# Read the saved CSV back and display its first rows.
saved_train_df = pd.read_csv('./train_df.csv')
saved_train_df.head()