In [None]:
from tqdm.notebook import tqdm
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from functools import partial

import pandas as pd
import numpy as np

import json
import re

# Widen pandas' display limits so wide frames and long text cells are shown
# in full in the notebook output instead of being elided.
_DISPLAY_OPTIONS = {
    'display.max_rows': 1000,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.max_colwidth': None,  # None removes the per-cell width limit
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [None]:
def get_raw_txt(_id, path="train"):
    """Load one document's JSON file and return its text as a single string.

    Parameters
    ----------
    _id : str
        Document identifier; the file read is ``{PATH}/{path}/{_id}.json``.
    path : str, default "train"
        Sub-folder of the module-level ``PATH`` that holds the JSON files.

    Returns
    -------
    str
        The ``"text"`` value of every entry in the JSON list, joined with a
        single space (an empty list yields ``""``).

    Raises
    ------
    FileNotFoundError
        If no JSON file exists for ``_id`` under ``path``.
    """
    # The context manager closes the handle as soon as parsing is done; the
    # previous bare open(...).read() left one file open per document.
    # The encoding is pinned so the result does not depend on the platform's
    # default locale.
    with open(f"{PATH}/{path}/{_id}.json", encoding="utf-8") as json_file:
        sections = json.load(json_file)
    return " ".join(section["text"] for section in sections)

def clean_text(txt):
    """Normalise a value to lowercase ASCII alphanumerics separated by single spaces.

    The input is converted with ``str()``, lowercased, every run of characters
    outside ``A-Z``, ``a-z`` and ``0-9`` is replaced by one space, and leading
    and trailing whitespace is removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

# Train set

In [None]:
# Root folder of the input data. It ends with a slash because the f-strings
# below append file names directly to it.
PATH = "../input/coleridgeinitiative-show-us-the-data/"

# Shared appearance of the progress bars used in this section.
tqdm_params = {
    'bar_format': '{desc}{bar} [ {n} / {total} (remaining: {remaining}) ]',
    'colour': "green",
}

# Read the training table, then attach to every row the text loaded from the
# JSON file named after its Id (progress_apply is provided by tqdm.pandas).
df_train = pd.read_csv(f"{PATH}train.csv")
tqdm.pandas(desc="Text download from JSON files status:", **tqdm_params)
df_train['text'] = df_train['Id'].progress_apply(get_raw_txt)

# Build `word_model`: the set of distinct, cleaned label strings found in the
# three label columns of the training table.
label_columns = ['dataset_label', 'dataset_title', 'cleaned_label']

# ravel() flattens the selection whatever the number of columns, so the column
# list above can change without touching a hard-coded reshape size.
raw_labels = pd.unique(df_train[label_columns].values.ravel())
# Skip missing values: str(nan) would otherwise become the bogus label "nan".
raw_labels = [label for label in raw_labels if pd.notna(label)]
print("Word_model created : contains all raw labels (Need to be cleaned before the next steps)")

# The set comprehension de-duplicates labels that become identical once
# cleaned; clean_text already strips, so no second strip pass is needed.
word_model = {
    clean_text(label)
    for label in tqdm(raw_labels, desc="Word model cleaning using clean_text:", **tqdm_params)
}
# A label made only of punctuation cleans to "", which is a substring of every
# text and would be "found" in every document later on.
word_model.discard("")

print(f"Word_model cleaned : now we have the whole list of labels from train set\n")
# \033[0m switches the bold attribute back off so it does not leak into
# whatever is printed next.
print(f"Word_model unique values : \033[1m{len(word_model)}\033[0m")

# Word cloud of the tokens that make up the cleaned labels.
# random_state pins the layout, and the labels are joined in sorted order
# because `word_model` is a set whose iteration order changes between kernel
# sessions; together they make the figure identical on every run.
wordcloud = WordCloud(
    max_font_size=15,
    background_color="white",
    height=100,
    scale=3,
    colormap="Greens",
    random_state=42,
)
wordcloud = wordcloud.generate(" ".join(sorted(word_model)))
plt.figure(figsize=(25,25))
# Title fixed: the variable is `word_model`, not "World_model".
plt.title("Word_model top words in labels\n")
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Submission set

In [None]:
# Same progress-bar layout as for the train set, in a darker colour.
tqdm_params = {
    'bar_format': '{desc}{bar} [ {n} / {total} (remaining: {remaining}) ]',
    'colour': "darkgreen",
}

submission = pd.read_csv(f"{PATH}sample_submission.csv")

# Load each document's text from the "test" folder ...
load_test_text = partial(get_raw_txt, path="test")
tqdm.pandas(desc="Text download from JSON files status :", **tqdm_params)
submission['text'] = submission['Id'].progress_apply(load_test_text)

# ... then normalise it with the same cleaning that was applied to the labels,
# so the two can be compared directly.
tqdm.pandas(desc="Text cleaning with clean_text function :", **tqdm_params)
submission['text'] = submission['text'].progress_apply(clean_text)

def _matching_labels(text, labels):
    """Return the labels that occur as substrings of `text`, joined with '|'.

    Empty labels are skipped because "" is a substring of every text.
    Returns "" when nothing matches.
    """
    return '|'.join(label for label in labels if label and label in text)


# Sorted so the order of labels inside each prediction is the same on every
# run (`word_model` is a set, whose iteration order is not stable).
known_labels = sorted(word_model)

# The whole column is assigned in one step. The previous row loop wrote through
# `submission.loc[i]['PredictionString']`, i.e. into a temporary row object,
# which pandas does not guarantee to propagate back to the frame (and never
# does under Copy-on-Write).
# NOTE(review): matching is plain substring containment, so a label can also
# match inside a longer word — confirm this is intended.
tqdm.pandas(desc="Label matching on cleaned text :", **tqdm_params)
submission['PredictionString'] = submission['text'].progress_apply(
    partial(_matching_labels, labels=known_labels)
)

# The text column was only needed for matching; remove it before writing
# the file.
submission = submission.drop(columns=["text"])
submission.to_csv("submission.csv", index=False)
# Bare expression: display the final table as the cell output.
submission