In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import json
import glob
from collections import defaultdict, Counter
from textblob import TextBlob
from functools import partial

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

import nltk
import spacy
# Shared spaCy pipeline for the notebook: the large English model, loaded with
# the 'parser' and 'ner' components disabled.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
# Set the pipeline's `max_length` to 4,000,000.
# NOTE(review): presumably raised so whole papers can be passed in one call — confirm.
nlp.max_length = 4000000
from nltk.probability import FreqDist
from wordcloud import WordCloud, STOPWORDS

from tqdm.autonotebook import tqdm
import string

%matplotlib inline

# List the competition input directory; as the cell's last expression the
# result is displayed.
os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/')

# 1. Load Data

In [None]:
# Training table. It is read from a separate `show-us-the-datanew` input rather
# than the competition folder; later cells read a `most_common` column from it.
# NOTE(review): presumably this file was precomputed with `clean_text_advanced`
# (see the commented-out preprocessing cell further down) — confirm.
train_df = pd.read_csv('../input/show-us-the-datanew/new_train.csv')
# Submission template; its `Id` column is used later to locate the test papers.
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
# Directories from which `read_append_return` loads `<Id>.json` files.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'
train_df.head()

In [None]:
# Preview the first rows of the submission template.
sample_sub.head()

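# 2. How many distinct cleaned_labels are there?

There are 130 distinct `cleaned_label` values in the training data (see the count printed by the cell below).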

In [None]:
# Frequency of every cleaned label across the training rows.
existing_set = Counter(list(train_df['cleaned_label']))
# (label, count) pairs, most frequent first.
sorted_set = sorted(existing_set.items(), key=lambda pair: pair[1], reverse=True)
# Adding up the per-label counts gives the number of rows that were counted.
total_set = sum(count for _, count in sorted_set)
print(total_set)

# Bar chart of the ten most frequent labels.
mostcommon_set = sorted_set[:10]
x, y = zip(*mostcommon_set)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel('Datasets', fontsize=50)
plt.ylabel('Frequency of Datasets', fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# The title is deliberately set after tight_layout, as in the original ordering.
plt.tight_layout(pad=0)
plt.title('Freq of 10 Most Common Datasets in cleaned_label', fontsize=60)
plt.show()

# Number of distinct cleaned labels.
existing_labels = train_df['cleaned_label'].unique()
print(len(existing_labels))

In [None]:
def clean_text(txt, nlp=None):
    """Lower-case `txt` and collapse every non-alphanumeric run into one space.

    Parameters
    ----------
    txt : any
        Value to clean; it is converted with `str()` first, so non-string
        input (e.g. `None`) is cleaned as its string representation.
    nlp : optional
        Unused. Kept so existing calls that pass a pipeline keep working.
        The default is `None` rather than the global spaCy pipeline so the
        function does not depend on the model being loaded.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace removed.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

In [None]:
def clean_text_advanced(txt, nlp=None, target_size=5):
    """Return the lemmas of the most frequent content words in `txt`.

    The text is lower-cased and stripped of non-alphanumeric characters, the
    remaining words are counted, and the words are visited from most to least
    frequent (ties keep first-occurrence order, so the result is
    deterministic). Stop words and all-digit tokens are skipped, and a lemma
    that was already selected is not emitted twice.

    Parameters
    ----------
    txt : any
        Text to summarise; converted with `str()` before cleaning.
    nlp : callable, optional
        spaCy-style pipeline: `nlp(word)[0]` must expose `is_stop`,
        `is_digit` and `lemma_`. When omitted, the notebook-level `nlp`
        object is looked up at call time.
    target_size : int, default 5
        Maximum number of lemmas to return.

    Returns
    -------
    str
        Up to `target_size` lemmas joined by single spaces (possibly empty).

    Notes
    -----
    Values produced by an earlier version of this function (for example a
    cached `most_common` column) are not comparable with the output of this
    one and should be regenerated.
    """
    if nlp is None:
        # The parameter shadows the notebook-level pipeline, so fetch that one
        # explicitly; doing it here (not in the signature) picks up a reloaded
        # pipeline without re-running this cell.
        nlp = globals()['nlp']

    cleaned_txt = re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
    # Count real occurrences. De-duplicating the words first would make every
    # count 1 and leave the selection to arbitrary set ordering.
    word_counts = Counter(cleaned_txt.split())

    lemmas = []
    seen_lemmas = set()
    for word, _count in word_counts.most_common():
        if len(lemmas) >= target_size:
            break
        token = nlp(word)[0]
        if token.is_stop or token.is_digit:
            continue
        lemma = token.lemma_
        if lemma in seen_lemmas:
            # e.g. "dataset" and "datasets" share a lemma; keep it once.
            continue
        seen_lemmas.add(lemma)
        lemmas.append(lemma)
    return ' '.join(lemmas)

# Example of clean_text effect

In [None]:
# print(clean_text_advanced("I am a boy. You are a Girl dataset datasets has have."))
# print(clean_text("I am a boy. You are a Girl dataset."))

In [None]:
def read_append_return(filename, train_files_path=None, output='text'):
    """Read `<filename>.json` and return its sections joined into one string.

    The JSON file is expected to hold a list of objects with the keys
    `section_title` and `text`. A section whose key is missing or null
    contributes an empty string instead of raising.

    Parameters
    ----------
    filename : str
        File name without the `.json` extension.
    train_files_path : str, optional
        Directory containing the JSON file. When omitted, the notebook-level
        `train_files_path` is looked up at call time.
    output : str, default 'text'
        'text' returns all section texts joined by spaces, 'head' returns all
        section titles joined by spaces, and any other value returns titles
        and texts interleaved and joined by '. '.

    Returns
    -------
    str
    """
    if train_files_path is None:
        # The parameter shadows the notebook-level default, so fetch it
        # explicitly at call time.
        train_files_path = globals()['train_files_path']

    json_path = os.path.join(train_files_path, filename + '.json')
    # Be explicit about the encoding instead of relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # `or ''` guards against absent/null fields, which str.join cannot handle.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
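# NOTE: one-off preprocessing kept for reference. It appears to be how the
# `text` / `most_common` columns of `new_train.csv` were generated; the train
# and test `most_common` values must come from the same version of
# `clean_text_advanced` to be comparable.
# %%time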
# tqdm.pandas()
# train_df['text'] = train_df['Id'].progress_apply(read_append_return)
# train_df['most_common'] = train_df['text'].progress_apply(clean_text_advanced)

# train_df.to_csv('new_train.csv', index=False)

In [None]:
# Preview the training table again before the test-side features are built.
train_df.head()

In [None]:
%%time
# Register `progress_apply` on pandas objects.
tqdm.pandas()
# Reader bound to the test directory, so each Id is loaded from the test papers.
read_test_paper = partial(read_append_return, train_files_path=test_files_path)
# Full text of every test paper, then its most-common-word summary.
sample_sub['text'] = sample_sub['Id'].progress_apply(read_test_paper)
sample_sub['most_common'] = sample_sub['text'].progress_apply(clean_text_advanced)

In [None]:
# Preview the submission template with the new `text` and `most_common` columns.
sample_sub.head()

In [None]:
def jaccard(str1, str2):
    """Jaccard similarity between the lower-cased word sets of two strings.

    Parameters
    ----------
    str1, str2 : str
        Whitespace-separated words. A non-string value (e.g. `None`, or the
        NaN pandas produces for an empty CSV field) is treated as having no
        words.

    Returns
    -------
    float
        |A ∩ B| / |A ∪ B|, or 0.0 when neither input contains any word
        (instead of dividing by zero).
    """
    def _word_set(value):
        # Only real strings carry words; anything else counts as empty.
        return set(value.lower().split()) if isinstance(value, str) else set()

    a = _word_set(str1)
    b = _word_set(str2)
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

In [None]:
# Every known label string (raw label, title and cleaned label), lower-cased.
# The Jaccard matching below does not use this set; an alternative
# exact-substring match of these labels against the paper text is not applied here.
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)
print(len(existing_labels))

# A training row votes for its label when the Jaccard similarity between its
# `most_common` words and the test paper's exceeds this value.
JACCARD_THRESHOLD = 0.1

id_list = []
lables_list = []  # (sic) the name is read by the submission cell
for index, row in tqdm(sample_sub.iterrows(), total=len(sample_sub)):
    sample_most_common = row['most_common']
    row_id = row['Id']
    # Similarity of every training row to this test paper.
    similarity = train_df['most_common'].progress_apply(
        partial(jaccard, str2=sample_most_common)
    )
    temp_df = train_df[similarity > JACCARD_THRESHOLD]
    # De-duplicate and sort so the prediction string is reproducible; joining
    # a bare set would give an arbitrary label order.
    cleaned_labels = sorted(set(temp_df['cleaned_label'].to_list()))
    print(len(cleaned_labels))
    print(cleaned_labels)
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

In [None]:
# Assemble the submission table: one row per test Id with its '|'-joined labels.
submission = pd.DataFrame().assign(
    Id=id_list,
    PredictionString=lables_list,
)

In [None]:
# Uncomment the next line to lift pandas' row/column display limits.
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Preview the first rows of the submission.
submission.head()

In [None]:
# Write the submission to `submission.csv` in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)