In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_tree = os.walk('/kaggle/input')
for folder, _, names in input_tree:
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json
import os
import re

import numpy
import pandas
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [None]:
# Folder holding train.csv, sample_submission.csv and the train/ and test/ JSON folders.
directory = r"../input/coleridgeinitiative-show-us-the-data/"

# Build the paths with os.path.join (as retrieve_text does) instead of string
# concatenation: `directory` already ends with a separator, so appending
# "/train.csv" produced a doubled "//" in the path.
train_csv = pandas.read_csv(os.path.join(directory, "train.csv"))
sample_submission = pandas.read_csv(os.path.join(directory, "sample_submission.csv"))

# Retrieve JSON data and clean it

In [None]:
def retrieve_text(filename, type):
    """Load one document's JSON file and return its cleaned, concatenated text.

    Args:
        filename: File name without the ".json" suffix.
        type: Sub-folder of ``directory`` that holds the file; the callers in
            this notebook pass ``'train'`` or ``'test'``. The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        The ``section_title`` and ``text`` values of every entry in the file,
        in file order, joined with single spaces and passed through
        ``data_cleaning``.
    """
    json_path = os.path.join(directory, type, filename + ".json")

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(json_path, mode='r', encoding='utf-8') as json_file:
        sections = json.load(json_file)

    pieces = []
    for section in sections:
        for key in ('section_title', 'text'):
            value = section.get(key)
            # A missing or null field would make " ".join raise TypeError,
            # so such fields are skipped instead of aborting the whole load.
            if value is not None:
                pieces.append(str(value))

    return data_cleaning(" ".join(pieces))

In [None]:
def data_cleaning(text):
    """Reduce text to lower-case ASCII letters and digits separated by single spaces.

    Every run of characters outside ``A-Za-z0-9`` (punctuation, whitespace,
    emoji and any other non-ASCII character) is replaced by exactly one
    space, and the result is lower-cased. Leading and trailing spaces are
    not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # A single substitution is enough: each maximal run of unwanted characters
    # collapses to one space, so neither repeated spaces nor emoji can be left
    # over for a second pass to remove.
    return re.sub('[^A-Za-z0-9]+', ' ', text).lower()

In [None]:
def load_json():
    """Add a 'json-content' column of cleaned document text to both frames.

    Operates in place on the module-level frames: the ``train_csv`` ids are
    read from the 'train' folder, and the ``sample_submission`` ids are read
    from the 'test' folder and stored on ``test_set``.
    """
    train_ids = train_csv['Id']
    train_csv['json-content'] = train_ids.apply(
        lambda doc_id: retrieve_text(doc_id, 'train')
    )

    test_ids = sample_submission['Id']
    test_set['json-content'] = test_ids.apply(
        lambda doc_id: retrieve_text(doc_id, 'test')
    )

In [None]:
# Start the test frame from the submission ids, then load the document text
# for both the training and the test frame.
test_set = pandas.DataFrame({'Id': sample_submission['Id']})
load_json()

# Preprocess data - retrieve useful info

In [None]:
def preprocess_data(dataframe):
    """Derive acronym and title lookup tables from the unique dataset titles.

    The function fills (and returns) the module-level containers
    ``acronyms_dict``, ``titles_dict``, ``titles_prior1`` and
    ``titles_prior2`` and reads the module-level ``stop_words`` list, so all
    five must exist before it is called.

    For every unique value of ``dataframe['dataset_title']``:

    * If the title contains "(", each space-separated word containing "(" is
      treated as a parenthesised acronym: its first and last characters are
      dropped and the lower-cased remainder is mapped to the title with the
      parentheses removed, lower-cased and non-letters replaced by spaces.
    * Otherwise an acronym is built from the initials of the title's
      non-stop-word words and mapped to the lower-cased title.
    * The cleaned full title is added to ``titles_prior1``. The cleaned title
      without the words containing "(" is added to ``titles_prior2`` (and
      ``titles_dict`` maps it back to the full title) unless it is already
      in ``titles_prior1``.

    Args:
        dataframe: Frame with a ``dataset_title`` column.

    Returns:
        The tuple ``(acronyms_dict, titles_dict, titles_prior1, titles_prior2)``.
    """
    # Build the stop-word set once instead of once per word of every title.
    stop_word_set = set(stop_words)

    for dataset_title in dataframe['dataset_title'].unique():
        try:
            title = str(dataset_title)

            if '(' in title:
                expanded_title = title.replace("(", "").replace(")", "").lower()
                expanded_title = re.sub('[^A-Za-z]+', " ", expanded_title)

                for word in title.split(" "):
                    if '(' in word:
                        # Assumes the word has the exact form "(ACRONYM)".
                        acronyms_dict[word[1:-1].lower()] = expanded_title
            else:
                words = re.sub('[^A-Za-z]+', " ", title).lower().split()
                initials = [word[0] for word in words if word not in stop_word_set]
                acronyms_dict["".join(initials)] = title.lower()

            full_title = re.sub('[^A-Za-z0-9]+', " ", title).lower().strip()

            # Join the kept words rather than pushing the list's repr through
            # the regex: repr() escapes characters such as tabs ("\t"), which
            # would leave stray letters in the cleaned title.
            kept_words = [word for word in title.lower().split(" ") if '(' not in word]
            short_title = re.sub('[^A-Za-z0-9]+', " ", " ".join(kept_words)).strip()

            titles_prior1.add(full_title)

            # NOTE: this check only sees the titles processed so far, so
            # whether a short title lands in titles_prior2 depends on the
            # iteration order of the unique titles.
            if short_title not in titles_prior1:
                titles_prior2.add(short_title)
                titles_dict[short_title] = full_title

        except Exception as exc:
            # Best effort: report the failing title and carry on with the rest.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print("exception occurred for title: ", dataset_title, repr(exc))

    return acronyms_dict, titles_dict, titles_prior1, titles_prior2

In [None]:
# English stop words; preprocess_data skips them when building acronyms from
# the initials of a title.
stop_words = stopwords.words('english')

# Fresh, empty accumulators. preprocess_data fills these module-level
# containers in place and hands the same objects back.
acronyms = set()
acronyms_dict, titles_dict = {}, {}
titles_prior1, titles_prior2 = set(), set()

acronyms_dict, titles_dict, titles_prior1, titles_prior2 = preprocess_data(train_csv)

In [None]:
# Display the acronym -> title mapping built by preprocess_data for inspection.
acronyms_dict

In [None]:
# Order the titles longest first. The prediction step skips a candidate that
# is already a substring of the matches collected so far, so a long title has
# to be tried before the shorter titles it contains.
# Ties are broken alphabetically: the inputs are sets of strings, whose
# iteration order changes between interpreter runs (string hash
# randomisation), and a length-only sort would carry that order into the
# output.
titles_prior1 = sorted(titles_prior1, key=lambda title: (-len(title), title))
titles_prior2 = sorted(titles_prior2, key=lambda title: (-len(title), title))

# Every distinct cleaned label present in the training data.
unique_cleaned_matches = train_csv['cleaned_label'].unique()

In [None]:
# Show the cleaned full titles in the order the prediction step will try them.
print(titles_prior1)

# Predict results and save them to the submission file

In [None]:
def _append_matches(match, haystack, candidates):
    """Extend a '|'-separated match string with the candidates found in haystack.

    Args:
        match: The matches collected so far, joined with '|' ('' if none).
        haystack: Anything supporting ``in`` - the document text for
            substring lookups, or the set of its words for whole-word lookups.
        candidates: Iterable of strings to look for, tried in order.

    Returns:
        ``match`` with every newly found candidate appended. A candidate that
        is already a substring of ``match`` is skipped, which also drops a
        label contained in a longer label matched earlier.
    """
    for candidate in candidates:
        if candidate in haystack and candidate not in match:
            match = match + '|' + candidate if match else candidate
    return match


acronyms = acronyms_dict.keys()
# Only acronyms longer than three characters are looked up; the filter does
# not depend on the document, so it is applied once up front.
long_acronyms = [acronym for acronym in acronyms if len(acronym) > 3]

match_out = []
for json_data in test_set['json-content']:
    document_text = str(json_data)
    document_words = set(json_data.split())

    # Candidate sources are tried in this fixed order: known cleaned labels,
    # full titles, titles without their parenthesised part, then acronyms
    # (which must appear as a whole word rather than as a substring).
    match = ''
    match = _append_matches(match, document_text, unique_cleaned_matches)
    match = _append_matches(match, document_text,
                            (str(title).lower() for title in titles_prior1))
    match = _append_matches(match, document_text,
                            (str(title).lower() for title in titles_prior2))
    match = _append_matches(match, document_words, long_acronyms)

    match_out.append(match)

In [None]:
# print(match_out)

In [None]:
# Pair every test id with its '|'-separated predictions and write the
# submission file without the index column.
result = pandas.DataFrame({
    'Id': test_set['Id'],
    'PredictionString': match_out,
})
result.to_csv('submission.csv', index=False)

In [None]:
# result = pandas.DataFrame()
# result['Id'] = train_csv['Id']
# result['title'] = train_csv['dataset_title']
# result['clean'] = train_csv['cleaned_label']
# result['PredictionString'] = match_out
# result.to_csv('submission.csv', index=False)