In [1]:
import sys
sys.path.append("..")
sys.path.append("../modeling/")

In [2]:
import pandas as pd                # Data processing
import matplotlib.pyplot as plt    # Data visualization purposes
import seaborn as sns              # Statistical data visualization
%matplotlib inline
sns.set_theme()

In [3]:
from modeling.database import MongoDB
db = MongoDB()

In [4]:
documents = db.get_all_documents()
df = pd.DataFrame(list(documents))

### Text cleaning

In [5]:
df['description'] = df['description'].apply(lambda x: x.lower())               # Convert to lower case
df['description'] = df['description'].str.replace(r'[^\w\s]', ' ', regex=True) # Remove special characters
df['description'] = df['description'].str.replace(r'\d+', '', regex=True)      # Remove digits
df['description'] = df['description'].str.replace(r'\s*\n+', ' ', regex=True)  # Remove new lines and whitespaces
df['description'].head()

0    we are looking for a bright machine learning e...
1    role description as a multidisciplinary team m...
2    intracom telecom is a global telecommunication...
3    the continuous development of web and mobile a...
4    about us baresquare is changing the world of a...
Name: description, dtype: object

In [6]:
#df['description'] = df['description'].apply(lambda x: [word for word in x if len(word) > 1 and word != "r"])
df['description'] = df['description'].str.replace('e g', '')
df['description'].head()

0    we are looking for a bright machine learning e...
1    role description as a multidisciplinary team m...
2    intracom telecom is a global telecommunication...
3    the continuous development of web and mobile a...
4    about us baresquare is changing the world of a...
Name: description, dtype: object

In [7]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

In [8]:
doc = nlp(df['description'][100])
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print(len(doc))

734


In [9]:
noun_phrases = []
for doc in nlp.pipe(df['description'].astype('unicode').values, batch_size=50, n_process=3):
    #if doc.has_annotation("DEP"):
    assert doc.has_annotation("DEP")
    for chunk in doc.noun_chunks:
        noun_phrases.append(chunk.text)

In [10]:
nouns_phrases_set = list(set(noun_phrases))
len(nouns_phrases_set)

11802

In [11]:
import random

#nouns_phrases_set[:100]
selection = random.sample(nouns_phrases_set, 4000)

In [12]:
import numpy as np

df = pd.DataFrame(selection, columns=["phrase"])
df["type"] = np.nan

df.to_csv('noun_phrases.csv', index=False)

In [13]:
df

Unnamed: 0,phrase,type
0,initiative,
1,edge machine learning algorithms,
2,open science platform,
3,snowflake,
4,initiatives,
...,...,...
3995,advanced tools,
3996,the processing,
3997,a more equal opportunity workplace,
3998,θέση και,
