# Training Dataset Creation

In [1]:
import sys
sys.path.append("..")
sys.path.append("../modeling/")

In [2]:
import pandas as pd                # Data processing
import matplotlib.pyplot as plt    # Data visualization purposes
import seaborn as sns              # Statistical data visualization
sns.set_theme()                    # Seaborn will handle plotting theme
%matplotlib inline

In [3]:
from modeling.database import MongoDB
db = MongoDB()
df = pd.DataFrame(list(db.get_all_documents()))

## Text cleaning

In [4]:
gr_chars = "ΑαΆάΒβΓγΔδΕεΈέΖζΗηΉήΘθΙιΊίΚκΛλΜμΝνΞξΟοΌόΠπΡρΣσςΤτΥυΎύΦφΧχΨψΩωΏώ"

def remove_greek(desc):
    for char in gr_chars:
        if char in desc:
            desc = desc.replace(char, '')
    return desc

In [5]:
# Remove Greek words as the ads were meant for Greece
df['description'] = df['description'].apply(lambda x: remove_greek(x))

# Remove usual examples
df['description'] = df['description'].str.replace('e.g.', '', regex=False)
df['description'] = df['description'].str.replace('i.e.', '', regex=False)

# Remove e-mails
df['description'] = df['description'].str.replace(r'\b[\w\.-]+@[\w\.-]+\.\w{2,6}\b', '', regex=True)

# Remove links
df['description'] = df['description'].str.replace(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', regex=True)

# Remove hashtags
df['description'] = df['description'].str.replace(r'\B#([a-z0-9]{1,})', '', regex=True)

# Rest
df['description'] = df['description'].str.replace(r'[^\w\s+#]', ' ', regex=True)        # Remove special characters
df['description'] = df['description'].str.replace(r'(\s{2,})|(\n+)', ' ', regex=True)   # Remove new lines and whitespaces

## Noun Chunk Extraction

In [6]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

In [7]:
doc = nlp(df['description'][100])
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print(len(doc))

668


In [8]:
noun_phrases = []
for doc in nlp.pipe(df['description'].astype('unicode').values, batch_size=50, n_process=6):
    assert doc.has_annotation("DEP")
    for chunk in doc.noun_chunks:
        noun_phrases.append(chunk.text)

In [9]:
nouns_phrases_set = list(set(noun_phrases))
len(nouns_phrases_set)

12081

## Training Set Selection

In [10]:
import random

random.seed(10)
selection = random.sample(nouns_phrases_set, 4000)

df_train = pd.DataFrame(selection, columns=["chunks"])
df_train["type"] = None

df_train.to_csv('noun_chunks.csv', index=False)