In [1]:
import pandas as pd
from source.library.text_cleaning_simple import tokenize, remove_stop_words, prepare, \
    get_stop_words, get_n_grams
from source.library.text_preparation import clean, predict_language

un_debates = pd.read_csv('/code/artifacts/data/external/un-general-debates-blueprint.csv.zip')

In [2]:
un_debates.head()

Unnamed: 0,session,year,country,country_name,speaker,position,text
0,25,1970,ALB,Albania,Mr. NAS,,33: May I first convey to our President the co...
1,25,1970,ARG,Argentina,Mr. DE PABLO PARDO,,177.\t : It is a fortunate coincidence that pr...
2,25,1970,AUS,Australia,Mr. McMAHON,,100.\t It is a pleasure for me to extend to y...
3,25,1970,AUT,Austria,Mr. KIRCHSCHLAEGER,,155.\t May I begin by expressing to Ambassado...
4,25,1970,BEL,Belgium,Mr. HARMEL,,"176. No doubt each of us, before coming up to ..."


In [3]:
sample = un_debates.loc[0, 'text'][0:1000]
sample

'33: May I first convey to our President the congratulations of the Albanian delegation on his election to the Presidency of the twenty-fifth session of the General Assembly?\n34.\tIn taking up the work on the agenda of the twenty- fifth session of the General Assembly, which is being held on the eve of the twenty-fifth anniversary of the coming into force of the Charter of the United Nations, the peace-loving Member States would have wished to be in a position to present on this occasion some picture of positive and satisfactory activity on the part of the United Nations. The Albanian delegation, for its part, would have taken great pleasure in drawing up such a balance sheet of activities covering a quarter of a century, which is certainly no short period in the life of an international organization. Unfortunately, this is not the situation. Created on the day after victory had been achieved over the Powers of the Rome BerlinTokyo Axis and conceived in the spirit of the principles wh

# `text_preperation`

In [4]:
clean(
    text=sample,
    remove_angle_bracket_content=True,
    remove_bracket_content=True,
    replace_urls=" _URL_ ",
    replace_hashtags=" _TAG_ ",
    replace_numbers=" _NUMBER_ ",
    replace_user_handles=" _USER_ ",
    replace_emoji=" _EMOJI_ "
)

'_NUMBER_ : May I first convey to our President the congratulations of the Albanian delegation on his election to the Presidency of the twenty-fifth session of the General Assembly? _NUMBER_ . In taking up the work on the agenda of the twentyfifth session of the General Assembly, which is being held on the eve of the twenty-fifth anniversary of the coming into force of the Charter of the United Nations, the peace-loving Member States would have wished to be in a position to present on this occasion some picture of positive and satisfactory activity on the part of the United Nations. The Albanian delegation, for its part, would have taken great pleasure in drawing up such a balance sheet of activities covering a quarter of a century, which is certainly no short period in the life of an international organization. Unfortunately, this is not the situation. Created on the day after victory had been achieved over the Powers of the Rome BerlinTokyo Axis and conceived in the spirit of the pri

In [7]:
print(predict_language(text=sample))
print(predict_language(text=sample, return_language_code=True))

English
en


# `text_cleaning_simple`

In [17]:
prepare(sample)[0:10]  # converts to lower case, tokenizes, and removes stop words

['may',
 'first',
 'convey',
 'president',
 'congratulations',
 'albanian',
 'delegation',
 'election',
 'presidency',
 'twenty-fifth']

In [9]:
def _get_n_grams(text: str, n=2) -> list:
    tokens = prepare(text=text, pipeline=[str.lower, tokenize])
    return get_n_grams(tokens=tokens, n=n, stop_words=get_stop_words())

n_grams_2 = _get_n_grams(text=sample, n=2)
n_grams_2

['first convey',
 'albanian delegation',
 'twenty-fifth session',
 'general assembly',
 'twenty- fifth',
 'fifth session',
 'general assembly',
 'twenty-fifth anniversary',
 'united nations',
 'peace-loving member',
 'member states',
 'states would',
 'satisfactory activity',
 'united nations',
 'albanian delegation',
 'part would',
 'taken great',
 'great pleasure',
 'balance sheet',
 'activities covering',
 'short period',
 'international organization',
 'organization unfortunately',
 'situation created',
 'rome berlintokyo',
 'berlintokyo axis']

In [10]:
n_grams_3 = _get_n_grams(text=sample, n=3)
n_grams_3

['twenty- fifth session',
 'peace-loving member states',
 'member states would',
 'taken great pleasure',
 'international organization unfortunately',
 'rome berlintokyo axis']

# `spaCy`

In [20]:
from source.library.spacy import create_spacy_pipeline, extract_from_doc, custom_tokenizer
nlp = create_spacy_pipeline(
    stopwords_to_add={'may', 'regards'},
    #stopwords_to_remove={'down'},
    tokenizer=custom_tokenizer
)
doc = create_spacy_pipeline()(sample)
entities = extract_from_doc(doc)
entities.keys()

dict_keys(['all_lemmas', 'partial_lemmas', 'bi_grams', 'adjs_verbs', 'nouns', 'noun_phrases', 'entities'])

In [21]:
entities['all_lemmas'][0:10]

['33',
 'may',
 'i',
 'first',
 'convey',
 'to',
 'our',
 'president',
 'the',
 'congratulation']

In [22]:
entities['bi_grams']

['albanian delegation',
 'fifth session',
 'general assembly',
 'twenty- fifth',
 'fifth session',
 'general assembly',
 'fifth anniversary',
 'united nations',
 'love member',
 'member states',
 'satisfactory activity',
 'united nations',
 'albanian delegation',
 'take great',
 'great pleasure',
 'balance sheet',
 'activity cover',
 'short period',
 'international organization',
 'rome berlintokyo',
 'berlintokyo axis']

In [23]:
entities['entities']

['33 (CARDINAL)',
 'first (ORDINAL)',
 'albanian (NORP)',
 'general assembly (ORG)',
 '34 (CARDINAL)',
 'fifth (ORDINAL)',
 'general assembly (ORG)',
 'eve of the twenty - fifth (DATE)',
 'united nations (ORG)',
 'albanian (NORP)',
 'quarter of a century (CARDINAL)',
 'day (DATE)']