# Training Dataset Creation

## Dataset Loading

In [1]:
import sys
sys.path.append("..")
sys.path.append("../modeling/")

In [2]:
import pandas as pd                # Data processing
import matplotlib.pyplot as plt    # Data visualization purposes
import seaborn as sns              # Statistical data visualization
sns.set_theme()                    # Seaborn will handle plotting theme
%matplotlib inline

In [3]:
from modeling.database import MongoDB
db = MongoDB()
df = pd.DataFrame(list(db.get_all_documents()))

## Text cleaning

The following code creates a dictionary that maps each Greek character to `None` using the `dict.fromkeys()` method.

Using the `map()` function, it converts each Greek character to its Unicode ordinal value by passing this list of values to the `ord()` function.

Finally, it uses the resulting dictionary with the `translate()` method to remove all Greek characters from the input string.

In [4]:
gr_chars = "ΑαΆάΒβΓγΔδΕεΈέΖζΗηΉήΘθΙιΊίΚκΛλΜμΝνΞξΟοΌόΠπΡρΣσςΤτΥυΎύΦφΧχΨψΩωΏώ"

def remove_greek(desc):
    translation_table = dict.fromkeys(map(ord, gr_chars), None)
    return desc.translate(translation_table)

# Remove Greek words as the ads were meant for Greece
df['description'] = df['description'].apply(lambda x: remove_greek(x))

In [5]:
# Use regex to remove unwanted formating
replacements = {
    'e.g.': '', 'i.e.': '',                                           # Remove usual examples
    r'\b[\w\.-]+@[\w\.-]+\.\w{2,6}\b': '',                            # Remove e-mails
    r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)': '',  # Remove links
    r'\B#([a-z0-9]{1,})': '',                                         # Remove hashtags
    r'[^\w\s+#]': ' ',                                                # Remove special characters
    r'(\s{2,})|(\n+)': ' '                                            # Remove new lines and whitespaces
}

df['description'] = df['description'].replace(replacements, regex=True)

## Noun Chunk Extraction

I will use spaCy’s `nlp.pipe()` method to process the "description" column in batches of 50 and with 6 processes.

Then, we iterate over each noun chunk in each document and append it to the `noun_phrases` list.

In [6]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

In [7]:
noun_phrases = []
for doc in nlp.pipe(df['description'].astype('unicode').values, batch_size=50, n_process=6):
    assert doc.has_annotation("DEP")
    for chunk in doc.noun_chunks:
        noun_phrases.append(chunk.text)

In [8]:
nouns_phrases_set = list(set(noun_phrases))
print("Extracted", len(nouns_phrases_set), "noun phrases.")

Extracted 9603 noun phrases.


## Training Set Selection

The following code will randomly sample 4000 records and export it in a csv file.

In [9]:
import random

random.seed(10)
selection = random.sample(nouns_phrases_set, 4000)

df_train = pd.DataFrame(selection, columns=["chunks"])
df_train["type"] = None

df_train.to_csv('noun_chunks.csv', index=False)