In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string
import re
from pickle import dump, load
import json
import time
from nltk.corpus import stopwords

In [None]:
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from numpy import asarray
from numpy import zeros

# Merging ``train.csv`` with the right articles

In [None]:
# Load the competition's training labels; the `Id` column is what `merge` below
# uses to locate each article's JSON file.
train = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

The following function compiles all the paragraphs of a given article (identified by the ``Id`` column of the ``train.csv`` file) into a single string, which we will then add as a new column to our ``train`` dataframe (``train.csv``).

In [None]:
def merge(filename, test=False):
    """Return the full text of one article as a single string.

    Reads ``<split>/<filename>.json`` from the competition input directory
    (the ``test`` folder when ``test`` is true, the ``train`` folder otherwise)
    and joins every entry of its ``text`` column with a single space.

    Parameters
    ----------
    filename : str
        Article identifier, i.e. the JSON file name without its extension.
    test : bool, default False
        Read from the ``test`` folder instead of the ``train`` folder.

    Returns
    -------
    str
        The space-joined section texts; sections whose text is missing are
        skipped.
    """
    split = 'test' if test else 'train'
    df = pd.read_json(f'../input/coleridgeinitiative-show-us-the-data/{split}/{filename}.json')
    # A null section text would make str.join raise TypeError, so drop missing
    # values and coerce whatever is left to str before joining.
    sections = df['text'].dropna().astype(str)
    return " ".join(sections)

Now we apply this function to our ``train.csv`` file to concatenate all our training data.

In [None]:
# Attach each article's full text to its row in `train`, and time the step
# because `merge` reads one JSON file per row.
start = time.time()
train['text'] = train['Id'].apply(merge)  # merge(Id) -> concatenated article text
end = time.time()
print(f'This cell executes in {end - start:.2f} seconds.') # usually about 1.5 min, but sometimes longer

In [None]:
# Preview the first rows to check that the new `text` column was added.
train.head()

# Preprocessing

Then we proceed to perform all our preprocessing steps:
* lowercasing all text
* removing all punctuation
* removing stopwords
* removing nonalphabetic tokens
* removing one-character words (only words of two or more characters are kept)

In [None]:
# lowercasing the text (vectorised over the whole column via the .str accessor);
# this overwrites the `text` column in place
train['text'] = train['text'].str.lower()

In [None]:
# removing punctuation
# Character class matching any single character of string.punctuation.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))

def remove_punctuation(text):
    """Delete every ``string.punctuation`` character from ``text``.

    The input is coerced with ``str`` and runs of whitespace are collapsed to
    single spaces. A token made only of punctuation leaves an empty slot
    behind, i.e. two adjacent spaces in the result.
    """
    normalised = ' '.join(str(text).split())
    return re_punc.sub('', normalised)

# Strip punctuation from every article; the function is passed directly to apply.
train['text'] = train['text'].apply(remove_punctuation)

In [None]:
# removing stopwords
stop_words = list(stopwords.words('english'))
# Membership is tested once per token of every article, so look tokens up in a
# hashed set instead of scanning the list each time. The set is a snapshot:
# rebuild it if `stop_words` is modified afterwards.
_stop_word_set = frozenset(stop_words)

def remove_stopwords(text):
    """Return ``text`` without its English stopwords.

    ``text`` is coerced with ``str`` and split on whitespace; tokens found in
    the stopword set are dropped and the remaining tokens are re-joined with
    single spaces. The comparison is exact (case-sensitive).
    """
    return ' '.join(word for word in str(text).split() if word not in _stop_word_set)

# Drop stopwords from every article; the function is passed directly to apply.
train['text'] = train['text'].apply(remove_stopwords)

In [None]:
# removing nonalphabetic tokens
def remove_nonalpha(text):
    """Keep only the whitespace-separated tokens of ``text`` made entirely of letters.

    ``text`` is coerced with ``str``; surviving tokens are re-joined with
    single spaces.
    """
    tokens = str(text).split()
    return ' '.join(filter(str.isalpha, tokens))

# Keep only purely alphabetic tokens in every article; the function is passed directly to apply.
train['text'] = train['text'].apply(remove_nonalpha)

In [None]:
# removing short words
def remove_short(text):
    """Drop one-character tokens from ``text``.

    ``text`` is coerced with ``str`` and split on whitespace; tokens of two or
    more characters are kept and re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if len(token) >= 2:
            kept.append(token)
    return ' '.join(kept)

# Drop one-character tokens from every article, then preview the cleaned frame.
train['text'] = train['text'].apply(remove_short)
train.head()

# String matching and how to make predictions

We first get the list of the test files' names (we should have 4 of them). We will then create a similar dataframe to ``train.csv`` with only an ``Id`` column and the article's text.

In [None]:
# Names (not full paths) of the entries in the test folder; each one is
# presumably an `<Id>.json` article file — verify against the dataset.
test_filenames = os.listdir('../input/coleridgeinitiative-show-us-the-data/test')

In [None]:
# turn it into a dataframe: let's first get the Id column
test = pd.DataFrame({'Id':test_filenames})
# Strip only the final extension: `merge` appends '.json' back to the Id, so
# cutting at the first dot would point at a non-existent file for any name
# that contains an extra dot.
test['Id'] = test['Id'].apply(lambda x : os.path.splitext(x)[0])

# we now add the text column (with the article text); `test=True` is forwarded
# to `merge` so the files are read from the test folder
test['text'] = test['Id'].apply(merge, test=True)

In [None]:
# Display the test dataframe (columns `Id` and `text`, one row per listed test file).
test

Now we must clean this text. But an easy way to have a first submission would be to search within the text column if we find dataset names that are already in our train set. We will perform that by keeping only words in the ``train[dataset_label]`` and ``train[dataset_title]`` columns (we refer to columns not cleaned because we have not cleaned the test articles' text).

We first extract the set of titles from these 2 columns and define how we will clean our text (we lowercase it, replace every run of non-alphanumeric characters with a single space, and strip leading and trailing spaces). The set of title words consists of the union of the sets of (unique) values from the ``train[dataset_label]`` and ``train[dataset_title]`` columns, that we then lowercase.

In [None]:
# Lowercase *before* de-duplicating so titles/labels that differ only by case
# collapse into a single entry, and sort so the matching order (and therefore
# the order of labels in the prediction string) is the same on every run.
title_words = sorted({
    title.lower()
    for column in ('dataset_title', 'dataset_label')
    for title in train[column].unique()
})

def clean_articles(text):
    """Normalise ``text`` to lowercase ASCII alphanumerics separated by single spaces.

    ``text`` is coerced with ``str`` and lowercased, every run of characters
    outside ``A-Za-z0-9`` becomes one space, and leading/trailing whitespace is
    stripped.
    """
    lowered = str(text).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

Then we load the submission template to fill.

In [None]:
# Load the sample submission; its `Id` column drives the prediction loop below.
submission = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")

Now we loop over the 4 rows of the submission dataframe to perform our string matching operation. More precisely, for each row we concatenate the text of the corresponding article, and then we filter this text based on our ``title_words`` list of words. Finally we build a string of title words separated by ``|``.

In [None]:
# One prediction string per submission row, in the same order as submission['Id'].
article_titles = []

for idx in submission['Id']:
    # All text rows for this article, joined and lowercased so it can be
    # compared against the lowercased entries of `title_words`.
    article_text = test.loc[test['Id'] == idx, 'text'].str.cat(sep = '\n').lower()

    # Several raw titles can clean to the same string (they may differ only in
    # case or punctuation); dict.fromkeys keeps the first occurrence of each so
    # the prediction string never repeats a label.
    label = list(dict.fromkeys(
        clean_articles(data_title)
        for data_title in title_words
        if data_title in article_text
    ))

    article_titles.append("|".join(label))

Let's add this to our submission dataframe and check it.

In [None]:
# Fill the prediction column; `article_titles` was built by iterating over
# submission['Id'], so the values line up with the rows one to one.
submission['PredictionString'] = article_titles
submission

Finally we save our submission.

In [None]:
# Write the submission to the current working directory, without the index column.
submission.to_csv('submission.csv', index = False)

Thanks for reading!

This notebook was inspired by the following resources:
* https://neptune.ai/blog/exploratory-data-analysis-natural-language-processing-tools
* https://www.kaggle.com/anthokalel/coleridge-complete-eda
* https://www.kaggle.com/ajaypawar123/eda-text-processing-string-matching-beginners/notebook#String-Matching