# Import libraries

In [0]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import spacy
from dframcy import DframCy

# Load the spaCy "en_core_web_sm" pipeline once. The resulting module-level
# `nlp` object is reused by the later cells (the DframCy demo, the single-row
# parse, and extract_nouns).
nlp = spacy.load("en_core_web_sm")



# Read recipe inputs

In [0]:
# Open the Dataiku input dataset by name and load it into a pandas DataFrame.
# Later cells read the "doc_id" and "text" columns of this frame.
corpus_for_knowledge_engineering = dataiku.Dataset("corpus_for_knowledge_engineering")
corpus_for_knowledge_engineering_df = corpus_for_knowledge_engineering.get_dataframe()

In [0]:
# Add lower-cased and upper-cased copies of the "text" column next to the
# original so the tagger can be run on each casing variant.
corpus_for_knowledge_engineering_df = corpus_for_knowledge_engineering_df.assign(
    lower_text=lambda frame: frame["text"].str.lower(),
    upper_text=lambda frame: frame["text"].str.upper(),
)

In [0]:
# Inspect the input frame, including the lower_text / upper_text columns.
corpus_for_knowledge_engineering_df

In [0]:

# Sanity check on a fixed sample sentence: turn the spaCy annotations into a
# DataFrame with DframCy and show only the tokens tagged NN or NNP.
dframcy = DframCy(nlp)
doc = dframcy.nlp("Apple is looking at buying U.K. startup for $1 billion")
pos_df = dframcy.to_dataframe(doc)
tags = ["NN", "NNP"]
pos_df.loc[pos_df["token_tag_"].isin(tags)]

In [0]:
# Parse the text of the first row of the corpus with spaCy.
# `.iloc[0]` selects by position, so this keeps working even when the frame's
# index does not contain the label 0 (e.g. after filtering or re-indexing).
text = corpus_for_knowledge_engineering_df["text"].iloc[0]
doc = nlp(text)

In [0]:
# Keep the tokens of the parsed document whose coarse part-of-speech is NOUN
# and show them as a two-column table (surface form, tag).
nouns = [
    (tok.text, tok.pos_)
    for tok in filter(lambda candidate: candidate.pos_ == "NOUN", doc)
]
pd.DataFrame(data=nouns, columns=['word', 'tag'])

In [0]:
# Preview the first two rows of the corpus.
corpus_for_knowledge_engineering_df.iloc[:2]

# Extract nouns

In [0]:
# Define the function to extract nouns and return a DataFrame
def extract_nouns(doc_id, text, max_length=3000000):
    """Return one row per token of ``text`` that spaCy tags as a noun.

    Parameters
    ----------
    doc_id
        Identifier of the document; copied unchanged into the ``doc_id``
        column of every returned row.
    text : str
        Text to analyse with the module-level ``nlp`` pipeline. A value that
        is not a string (``None``, or the ``NaN`` pandas uses for an empty
        cell) yields an empty result instead of raising inside spaCy.
    max_length : int, default 3000000
        Lower bound enforced on ``nlp.max_length`` before parsing. It must be
        larger than the length of ``text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_id``, ``word`` (token text) and ``tag`` (always
        ``"NOUN"``), one row per noun token, in document order.
    """
    columns = ['doc_id', 'word', 'tag']

    # Missing text cannot be parsed; return a frame with the right columns so
    # the caller's pd.concat still works.
    if not isinstance(text, str):
        return pd.DataFrame([], columns=columns)

    # Only ever raise the pipeline limit: a larger limit configured elsewhere
    # must not be silently lowered by calling this function.
    if nlp.max_length < max_length:
        nlp.max_length = max_length

    doc = nlp(text)
    nouns = [(doc_id, token.text, token.pos_) for token in doc if token.pos_ == "NOUN"]
    return pd.DataFrame(nouns, columns=columns)

In [0]:
# Run extract_nouns on the original text of every document and stack the
# per-document results into one frame with a fresh 0..n-1 index.
pos_df = pd.concat(
    [
        extract_nouns(document_id, document_text)
        for document_id, document_text in zip(
            corpus_for_knowledge_engineering_df['doc_id'],
            corpus_for_knowledge_engineering_df['text'],
        )
    ],
    ignore_index=True,
)

In [0]:
# Add a lower-cased "string" key for each noun, then drop exact duplicate rows.
pos_df = pos_df.assign(string=pos_df["word"].str.lower()).drop_duplicates()

In [0]:
# Inspect the nouns extracted from the original-case text.
pos_df

## Lower case

In [0]:
# Same extraction as for the original text, but run on the lower-cased copy
# of every document; results are stacked with a fresh 0..n-1 index.
lower_pos_df = pd.concat(
    [
        extract_nouns(document_id, lowered_text)
        for document_id, lowered_text in zip(
            corpus_for_knowledge_engineering_df['doc_id'],
            corpus_for_knowledge_engineering_df['lower_text'],
        )
    ],
    ignore_index=True,
)

In [0]:
# Shape the lower-case results for joining: add the "string" key, rename the
# tag column so it does not clash with the original-case tag, drop the raw
# token text, and remove duplicate rows.
lower_pos_df = (
    lower_pos_df
    .assign(string=lower_pos_df["word"].str.lower())
    .rename(columns={'tag': 'lower_tag'})
    .drop(columns=['word'])
    .drop_duplicates()
)

In [0]:
# Inspect the nouns extracted from the lower-cased text.
lower_pos_df

In [0]:
# Inner-join the original-case and lower-case results on the columns the two
# frames share (document id and lower-cased noun), so only nouns tagged as
# NOUN in both variants of a document remain.
merged_pos_df = pos_df.merge(lower_pos_df, how="inner", on=["doc_id", "string"])

In [0]:
# Inspect the joined original-case / lower-case noun table.
merged_pos_df

## Upper case

In [0]:
# Write recipe outputs
# The frame handed to write_with_schema was never assigned anywhere in the
# notebook, so this cell raised NameError. The joined noun table is the final
# result computed above.
# NOTE(review): assumes merged_pos_df is the intended output — confirm.
nouns_extracted_with_spacy_df = merged_pos_df
nouns_extracted_with_spacy = dataiku.Dataset("nouns_extracted_with_spacy")
nouns_extracted_with_spacy.write_with_schema(nouns_extracted_with_spacy_df)