In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import re
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm.autonotebook import tqdm
from functools import partial
from wordcloud import WordCloud, STOPWORDS
import nltk
import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
nlp.max_length = 4000000
from nltk.probability import FreqDist

from tqdm import tqdm
tqdm.pandas()

from nltk.corpus import stopwords
from unidecode import unidecode

STOPWORDS = set(stopwords.words('english'))

In [None]:
# Load the training table; later cells read its Id, dataset_title,
# dataset_label and cleaned_label columns.
train_csv_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_csv_path)
# Preview the first rows.
train.head()

In [None]:
# Show the column names of the training table.
train.columns

In [None]:
# Print "<column>:<number of distinct values>" for every training column.
# dropna=False keeps missing values in the count, as len(unique()) does.
for col in train.columns:
    distinct_count = train[col].nunique(dropna=False)
    print(f"{col}:{distinct_count}")

In [None]:
# Load the sample submission; its Id column lists the test publications.
sample_sub_csv_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_sub = pd.read_csv(sample_sub_csv_path)
# Preview the first rows.
sample_sub.head()

In [None]:
# Directories holding the publication JSON files; json_to_text() opens
# "<directory>/<Id>.json" inside them.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
def json_to_text(filename, train_files_path=train_files_path, output='text'):
    """Load one publication JSON file and flatten it into a single string.

    The file ``<train_files_path>/<filename>.json`` must decode to a list of
    section objects; each one is read through its ``section_title`` and
    ``text`` keys.

    Args:
        filename: File name without the ``.json`` extension.
        train_files_path: Directory containing the JSON files. The default is
            bound to the module-level ``train_files_path`` when this function
            is defined, so re-run this cell after changing that variable.
        output: ``'text'`` returns the section bodies joined by spaces,
            ``'head'`` returns the section titles joined by spaces, and any
            other value returns titles and bodies interleaved (title first),
            joined by ``'. '``.

    Returns:
        str: The flattened text selected by ``output``.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A section may lack a key (.get() -> None) or carry an explicit null.
    # str.join() raises TypeError on None, so fall back to an empty string.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    # Only build the string that was actually requested.
    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
# Register progress_apply on pandas objects (the setup cell already does this).
tqdm.pandas()
# Read every training publication through json_to_text with its defaults
# (training directory, output='text') and store the result per row.
train['text'] = train['Id'].progress_apply(json_to_text)

In [None]:
# Preview the training table now that it has the 'text' column.
train.head()

In [None]:
tqdm.pandas()
# Same loader as for the training set, pointed at the test directory instead.
load_test_text = partial(json_to_text, train_files_path=test_files_path)
sample_sub['text'] = sample_sub['Id'].progress_apply(load_test_text)

In [None]:
# Preview up to ten test rows together with their loaded text.
sample_sub.head(10)

In [None]:
def text_cleaning(text):
    """Normalise text to lowercase ASCII alphanumerics separated by single spaces.

    Steps, in order:
      1. ASCII punctuation (``string.punctuation``) is deleted outright, so
         ``"U.S."`` becomes ``"us"`` rather than ``"u s"``.
      2. The text is lowercased.
      3. Every remaining run of characters outside ``[A-Za-z0-9]``
         (whitespace, emoji, accented letters, ...) becomes one space.
      4. Leading and trailing spaces are stripped.

    After step 3 the text holds only ``[a-z0-9 ]`` with no repeated spaces,
    so no separate space-collapsing or emoji-removal pass is needed.

    Args:
        text: Value to clean. It is coerced with ``str()`` before any
            processing, so non-string input no longer raises while being
            iterated.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Coerce first so the punctuation filter always operates on a string.
    without_punctuation = str(text).translate(
        str.maketrans('', '', string.punctuation)
    )
    return re.sub('[^A-Za-z0-9]+', ' ', without_punctuation.lower()).strip()

In [None]:
tqdm.pandas()
# Overwrite the training texts with their text_cleaning() form.
# Only `train` is cleaned in this cell; sample_sub['text'] is left as loaded.
train['text'] = train['text'].progress_apply(text_cleaning)

In [None]:
# Evaluate it using the metric that they use in this dataset
def jaccard(str1, str2):
    """Return the Jaccard similarity of two strings' word sets.

    Both strings are lowercased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting sets of tokens.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: Similarity in ``[0.0, 1.0]``. When neither string contains a
        token the ratio is undefined; ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs were empty or whitespace-only.
        return 0.0
    return len(a & b) / union_size

In [None]:
import itertools
import collections

# Number of training documents sampled for the word-frequency count.
# This used to be stored in a variable called `max`, which shadowed the
# builtin max() for every later cell in the kernel.
MAX_DOCS = 1500

# Tokenise the first MAX_DOCS cleaned documents on whitespace. The slice is
# positional, so it does not depend on the index labels and does not raise
# when the frame holds fewer than MAX_DOCS rows.
L_word = []
for doc_text in train['text'].iloc[:MAX_DOCS]:
    L_word.extend(doc_text.split())
print(len(L_word))

# Drop stopwords and tokens of two characters or fewer.
L_new = [x for x in L_word if x not in STOPWORDS and len(x) > 2]
print(len(L_new))

words = [L_new]
all_words = list(itertools.chain(*words))

# Token -> number of occurrences across the sampled documents.
counts_words = collections.Counter(all_words)

counts_words.most_common(15)

In [None]:
# Take the row count from the frame itself. The previous hard-coded 19661
# raised KeyError on a shorter frame and silently ignored extra rows on a
# longer one.
max_len = len(train)

# Tokenise every raw dataset label on whitespace. Iterating the column
# avoids label-based lookups that depend on a default integer index.
L_labels = []
for label in train['dataset_label']:
    L_labels.extend(label.split())
print(len(L_labels))

# Drop stopwords and tokens of two characters or fewer.
L_new_lab = [x for x in L_labels if x not in STOPWORDS and len(x) > 2]
print(len(L_new_lab))

labels = [L_new_lab]

# Label token -> number of occurrences across all training rows.
counts_labels = collections.Counter(list(itertools.chain(*labels)))

# Number of distinct label tokens.
len(counts_labels.most_common())

In [None]:
def extract(L):
    """Return the first element of every item in ``L``, in order.

    Used on ``Counter.most_common()`` output to keep the keys and drop the
    counts.
    """
    return [item[0] for item in L]

# Upper bound on how many top entries are taken from each counter.
maxi = 2000

# Rank each counter once and reuse the result for the merged list and for
# the size report below.
top_label_pairs = counts_labels.most_common(maxi)
top_word_pairs = counts_words.most_common(maxi)

# Label tokens first, then document tokens; duplicates are not removed.
most_frequent = extract(top_label_pairs) + extract(top_word_pairs)
print(len(top_label_pairs))
print(len(top_word_pairs))
print(len(most_frequent))

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` and reduce it to space-separated ASCII alphanumerics.

    The input is coerced with ``str()``, lowercased, every run of characters
    outside ``[A-Za-z0-9]`` is replaced by a single space, and surrounding
    whitespace is stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()


# Tokenise the raw test publications on whitespace. Iterating the column
# avoids label-based lookups that depend on a default integer index.
L_sub = []
for doc_text in sample_sub['text']:
    L_sub.extend(doc_text.split())
print(len(L_sub))

# Normalise each token BEFORE filtering. Filtering the raw token let
# capitalised stopwords ("The"), punctuation-wrapped short tokens ("(a)")
# and punctuation-only tokens through, so the vocabulary ended up holding
# stopwords, one-letter words and empty strings.
L_new_sub = []
for x in L_sub:
    token = clean_text(x)
    if token not in STOPWORDS and len(token) > 2:
        L_new_sub.append(token)
print(len(L_new_sub))

wordd = [L_new_sub]
all_sub = list(itertools.chain(*wordd))

# Cleaned token -> number of occurrences across the test publications.
counts_sub = collections.Counter(all_sub)

# The `maxi` most frequent test tokens; predict() uses this list as a filter.
most = extract(counts_sub.most_common(maxi))

counts_sub.most_common(15)

In [None]:
# Lowercased distinct values of the three label-like training columns,
# one list per column, in this order: label, title, cleaned label.
temp_1, temp_2, temp_3 = (
    [value.lower() for value in train[column].unique()]
    for column in ('dataset_label', 'dataset_title', 'cleaned_label')
)
# Every known way a dataset is referred to in the training data.
existing_labels = set().union(temp_1, temp_2, temp_3)

def predict(sample_sub):
    """Predict dataset labels for every publication in ``sample_sub``.

    For each row, two sources of labels are merged:

    1. Exact lookup: the ``cleaned_label`` values of all ``train`` rows whose
       ``text`` equals ``text_cleaning(row['text'])``.
    2. String matching: every entry of the module-level ``existing_labels``
       that occurs as a substring of the lowercased row text, is not a
       stopword, is longer than one character, and is contained in the
       module-level ``most`` list.

    All labels are passed through ``clean_text`` and de-duplicated.

    NOTE(review): ``most`` is built from whitespace-split tokens, so the
    ``in most`` filter appears to admit almost exclusively single-word
    labels; confirm that this is intended.

    Args:
        sample_sub: DataFrame with ``Id`` and ``text`` columns.

    Returns:
        tuple: ``(id_list, labels_list)``. ``labels_list[i]`` holds the
        predictions for ``id_list[i]`` joined with ``'|'`` in sorted order,
        so the output is reproducible across runs; it is an empty string
        when nothing matched.
    """
    # These filters do not depend on the row, so evaluate them once instead
    # of for every (row, label) pair. The set makes the membership test O(1)
    # instead of a linear scan of the `most` list.
    frequent_tokens = set(most)
    candidate_labels = [
        known_label
        for known_label in existing_labels
        if known_label not in STOPWORDS
        and len(known_label) > 1
        and known_label in frequent_tokens
    ]

    id_list = []
    labels_list = []
    for _, row in tqdm(sample_sub.iterrows(), total=len(sample_sub)):
        sample_text = row['text']

        # Labels of training publications with identical cleaned text.
        temp_df = train[train['text'] == text_cleaning(sample_text)]
        cleaned_labels = {clean_text(x) for x in temp_df['cleaned_label']}

        # Known labels mentioned verbatim (case-insensitively) in the text.
        lowered_text = sample_text.lower()
        cleaned_labels.update(
            clean_text(known_label)
            for known_label in candidate_labels
            if known_label in lowered_text
        )

        # Sets iterate in arbitrary order; sort for a stable output string.
        labels_list.append('|'.join(sorted(cleaned_labels)))
        id_list.append(row['Id'])
    return (id_list, labels_list)

In [None]:
# Z is the (ids, prediction strings) pair returned by predict().
Z = predict(sample_sub)
# One row per test publication, in the column order Id, PredictionString.
submission = pd.DataFrame({'Id': Z[0], 'PredictionString': Z[1]})
submission

In [None]:
# Rough sanity check on the first 1000 training rows.
# NOTE: predict() looks each text up in `train` itself, so these rows get
# their own labels back and the score is optimistic.
eval_rows = train.head(1000)
X = predict(eval_rows)[1]

# Predictions are '|'-joined clean_text() labels. Turn the separator into a
# space so tokens of neighbouring labels are not fused ("a b|c d" used to
# yield the token "b|c"), and compare against `cleaned_label`, which is in
# the same normalised form as the predictions. Pairing with zip() avoids
# label-based lookups into `train`.
score = [
    jaccard(prediction.replace('|', ' '), truth)
    for prediction, truth in zip(X, eval_rows['cleaned_label'])
]
print(f'Score is : {np.mean(score)}')

In [None]:
# Write the submission without the row index. `index` is documented as a
# bool; the previous `index=None` only worked because None is falsy.
submission.to_csv('submission.csv', index=False)