# Imports

In [1]:
%autosave 60

Autosaving every 60 seconds


In [2]:
import numpy
import seaborn
import matplotlib.pyplot as plt
import pandas
import pickle
from pathlib import Path
from dataclasses import dataclass, field
from tqdm import tqdm
import transformers # conda install tensorflow
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from IPython.display import display, HTML
from collections import deque
from multiprocessing import Pool
import os
from functools import partial
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Data ../cleaning/

It is necessary to define the class `Article` before loading the pickle file with the extracted sentences since the pickle file contains instances of this class.

In [3]:
@dataclass(frozen=True)
class Article:
    id: int
    language: str
    sentences: list[str] = field(repr=False)
    original_text: str = field(repr=False)
    number_of_sentences: int = field(init=False)
    persons: tuple[list] = field(init=False, repr=False)
    
    def __post_init__(self):
        object.__setattr__(self, 'number_of_sentences', len(self.sentences))
        object.__setattr__(self, 'persons', tuple([] for i in range(self.number_of_sentences))) # one list per sentence
    
    def render_html(self):
        """Print attribute `original_text` rendered in HTML"""
        display(HTML(self.original_text))

Read pickled file that stores the sentences of the articles

In [4]:
inputdir = Path('data')
file = 'portuguese_sentences.pickle'
with open(inputdir/file, 'rb') as file:
    articles = pickle.load(file)

Show info for the first 10 articles

In [5]:
articles[:10]

[Article(id=54006424, language='PORTUGUESE_BRAZIL', number_of_sentences=41),
 Article(id=54003004, language='PORTUGUESE_BRAZIL', number_of_sentences=34),
 Article(id=54009380, language='PORTUGUESE_BRAZIL', number_of_sentences=14),
 Article(id=54008153, language='PORTUGUESE_BRAZIL', number_of_sentences=22),
 Article(id=54009331, language='PORTUGUESE_BRAZIL', number_of_sentences=20),
 Article(id=54010979, language='PORTUGUESE_BRAZIL', number_of_sentences=32),
 Article(id=54011278, language='PORTUGUESE_BRAZIL', number_of_sentences=19),
 Article(id=54012392, language='PORTUGUESE_BRAZIL', number_of_sentences=161),
 Article(id=54015488, language='PORTUGUESE_BRAZIL', number_of_sentences=13),
 Article(id=54015413, language='PORTUGUESE_BRAZIL', number_of_sentences=19)]

# Named Entity Recognition (NER)

Select a model from the Hugging Face hub and load it along with its tokenizer and a token-classification head (NER).

In [6]:
checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

Use the pipeline for NER and group similar entities (e.g.: name and last name).

In [7]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Read the docs

In [8]:
nlp?

[0;31mSignature:[0m      [0mnlp[0m[0;34m([0m[0minputs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           TokenClassificationPipeline
[0;31mString form:[0m    <transformers.pipelines.token_classification.TokenClassificationPipeline object at 0x7fbf4b87ee90>
[0;31mFile:[0m           ~/opt/anaconda3/envs/s2ds/lib/python3.10/site-packages/transformers/pipelines/token_classification.py
[0;31mDocstring:[0m     
Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
examples](../task_summary#named-entity-recognition) for more information.

This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellan

In [12]:
pipeline?

[0;31mSignature:[0m
[0mpipeline[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtask[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mOptional[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtransformers[0m[0;34m.[0m[0mconfiguration_utils[0m[0;34m.[0m[0mPretrainedConfig[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtransformers[0m[0;34m.[0m[0mtokenization_utils[0m[0;34m.[0m[0mPreTrainedTokenizer[0m[0;34m,[0m [0mtransformers[0m[0;34m.[0m[0mtokenization_utils_fast[0m[0;34m.[0m[0mPreTrainedTokenizerFast[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeature_extractor[0m

## Test

Try the NER on one sentence

In [9]:
text = articles[0].sentences[0]
text

'O presidente do FC Schalke 04 e proprietário de um frigorífico no centro de um novo surto de coronavírus na Alemanha, Clemens Tönnies, renunciou nesta terça-feira (30/06) ao cargo que ocupava há 19 anos no tradicional clube de futebol alemão após se envolver em uma série de controvérsias, ainda que em questões bastante diferentes.'

In [10]:
nlp(text)

[{'entity_group': 'ORG',
  'score': 0.9998289,
  'word': 'FC Schalke 04',
  'start': 16,
  'end': 29},
 {'entity_group': 'LOC',
  'score': 0.99983025,
  'word': 'Alemanha',
  'start': 111,
  'end': 119},
 {'entity_group': 'PER',
  'score': 0.99981546,
  'word': 'Clemens Tönnies',
  'start': 121,
  'end': 137}]

Extract only the names from the NER output

In [11]:
[group['word'] for group in nlp(text) if group['entity_group']=='PER']

['Clemens Tönnies']

## Check the first 10 articles 

(You may want to interrup the code in the next cell since it takes time to run)

In [26]:
for i1,article in enumerate(articles[:10]):
    for i2,sentence in enumerate(article.sentences):
        names = [group['word'] for group in nlp(sentence) if group['entity_group']=='PER' and group['score']>0.8]
        if '#' in ''.join(names):
            names = [] # Keep no names if NER includes the # character
        # if len(names)>0:
        #     print(f'article={i1}, sentence={i2}, {names=}')
        print(f'article={i1}, sentence={i2}, {names=}')
    

article=0, sentence=0, names=['Clemens Tönnies']
article=0, sentence=1, names=[]
article=0, sentence=2, names=[]
article=0, sentence=3, names=[]
article=0, sentence=4, names=[]
article=0, sentence=5, names=[]
article=0, sentence=6, names=['Gerald Asamoah', 'Hans Sarpei', 'Cacau']
article=0, sentence=7, names=[]
article=0, sentence=8, names=[]
article=0, sentence=9, names=[]
article=0, sentence=10, names=[]
article=0, sentence=11, names=[]
article=0, sentence=12, names=[]
article=0, sentence=13, names=[]
article=0, sentence=14, names=[]
article=0, sentence=15, names=[]
article=0, sentence=16, names=[]
article=0, sentence=17, names=[]
article=0, sentence=18, names=[]
article=0, sentence=19, names=[]
article=0, sentence=20, names=[]
article=0, sentence=21, names=[]
article=0, sentence=22, names=[]
article=0, sentence=23, names=[]
article=0, sentence=24, names=[]
article=0, sentence=25, names=[]
article=0, sentence=26, names=[]
article=0, sentence=27, names=[]
article=0, sentence=28, name

## Collect names in the dataclass

Let us try with only 2 articles

In [28]:
for i1,article in enumerate(articles[:2]):
    for i2,(sentence,persons_list) in enumerate(zip(article.sentences,article.persons)):
        persons_list.clear() # Enure we start with an empty list
        print(persons_list)
        names = [group['word'] for group in nlp(sentence) if group['entity_group']=='PER' and group['score']>0.8]
        if '#' in ''.join(names):
            names = [] # Keep no names if NER includes the # character
        persons_list.extend(names)
        print(f'article={i1}, sentence={i2}, {names=}, {persons_list=}')

[]
article=0, sentence=0, names=['Clemens Tönnies'], persons_list=['Clemens Tönnies']
[]
article=0, sentence=1, names=[], persons_list=[]
[]
article=0, sentence=2, names=[], persons_list=[]
[]
article=0, sentence=3, names=[], persons_list=[]
[]
article=0, sentence=4, names=[], persons_list=[]
[]
article=0, sentence=5, names=[], persons_list=[]
[]
article=0, sentence=6, names=['Gerald Asamoah', 'Hans Sarpei', 'Cacau'], persons_list=['Gerald Asamoah', 'Hans Sarpei', 'Cacau']
[]
article=0, sentence=7, names=[], persons_list=[]
[]
article=0, sentence=8, names=[], persons_list=[]
[]
article=0, sentence=9, names=[], persons_list=[]
[]
article=0, sentence=10, names=[], persons_list=[]
[]
article=0, sentence=11, names=[], persons_list=[]
[]
article=0, sentence=12, names=[], persons_list=[]
[]
article=0, sentence=13, names=[], persons_list=[]
[]
article=0, sentence=14, names=[], persons_list=[]
[]
article=0, sentence=15, names=[], persons_list=[]
[]
article=0, sentence=16, names=[], persons_l

Show information about the first articles

In [29]:
articles[:3]

[Article(id=54006424, language='PORTUGUESE_BRAZIL', number_of_sentences=41),
 Article(id=54003004, language='PORTUGUESE_BRAZIL', number_of_sentences=34),
 Article(id=54009380, language='PORTUGUESE_BRAZIL', number_of_sentences=14)]

Check sentences and persons correspondence for article 0

In [30]:
articles[1].sentences[0]

'Em meados de março, quando o coronavírus começou a se espalhar pela Europa, a chanceler federal alemã, Angela Merkel, deu um aviso implacável: "Este é o maior desafio que a União Europeia já enfrentou", disse.'

In [31]:
articles[1].persons[0]

['Angela Merkel']

Check sentences and persons correspondence for article 1

In [32]:
articles[1].sentences[1]

'Agora, à medida que os países-membros começam a relaxar o isolamento e reabrem as fronteiras, a atenção se volta à recuperação.'

In [33]:
articles[1].persons[1]

[]

Check sentences and persons correspondence for article 2

In [34]:
articles[1].sentences[2]

'Se existe algum país em condições de catalisar a integração fiscal necessária para estabilizar a Europa, esse país é a Alemanha.'

In [35]:
articles[1].persons[2]

[]

Show the number of sentences in the first article

In [36]:
len(articles[0].sentences)

41

# Collect names in all articles

## Parallel run (not used)

Not used because it seemed to take longer thatn the non-parallel version, maybe due to RAM or CPU resources (2 CPU and 4 GB RAM when I tried it).

In [37]:
os.environ["TOKENIZERS_PARALLELISM"] = "false" # set to "true" to parallelize (said by the transformers package)

Print the value of the environment variable 

In [22]:
! echo $TOKENIZERS_PARALLELISM

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
false


Parallelize

In [36]:
ncores = 2
_find_and_add_names = partial(find_and_add_names, ner=nlp) # alias of function with only one argument
with Pool(ncores) as pool:
    it = pool.imap_unordered(_find_and_add_names, articles)
    it = tqdm(it, total=len(articles))
    # Exhaust the iterator
    deque(it, maxlen=0)

  1%|          | 93/11848 [12:08<25:34:39,  7.83s/it]Process ForkPoolWorker-11:
Process ForkPoolWorker-12:
Traceback (most recent call last):
  File "/home/stefano/anaconda3/envs/py310/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/stefano/anaconda3/envs/py310/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/stefano/anaconda3/envs/py310/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/tmp/ipykernel_622/494436015.py", line 7, in find_and_add_names
    names = [group['word'] for group in ner(sentence) if group['entity_group']=='PER' and group['score']>score]
  File "/home/stefano/anaconda3/envs/py310/lib/python3.10/site-packages/transformers/pipelines/token_classification.py", line 192, in __call__
    return super().__call__(inputs, **kwargs)
  File "/home/stefano/anacond

KeyboardInterrupt: 

## Non-parallel run

The following cell took 16 hours to finish using a coputer with 2 CPUs and 2 (or 4) GB of RAM.

In [9]:
for article in tqdm(articles):
    for sentence,persons_list in zip(article.sentences,article.persons):
        persons_list.clear() # Enure we start with an empty list
        names = [group['word'] for group in nlp(sentence) if group['entity_group']=='PER' and group['score']>0.8]
        if '#' in ''.join(names):
            names = [] # Keep no names if NER includes the # character
        persons_list.extend(names)

100%|██████████| 11848/11848 [16:44:43<00:00,  5.09s/it]  


## Save results

In [17]:
outputdir = Path.cwd()
file = 'portuguese_sentences_with_names.pickle'

In [14]:
with open(outputdir/file, 'wb') as file:
    pickle.dump(articles, file)

# Read results and format as CSV

Read pickled file that stores the sentences of all articles

In [8]:
inputdir = Path.cwd()
file = Path('portuguese_sentences_with_names.pickle')
with open(inputdir/file, 'rb') as file:
    articles = pickle.load(file)

Show info ofr the first 10 articles

In [9]:
articles[:10]

[Article(id=54006424, language='PORTUGUESE_BRAZIL', number_of_sentences=41),
 Article(id=54003004, language='PORTUGUESE_BRAZIL', number_of_sentences=34),
 Article(id=54009380, language='PORTUGUESE_BRAZIL', number_of_sentences=14),
 Article(id=54008153, language='PORTUGUESE_BRAZIL', number_of_sentences=22),
 Article(id=54009331, language='PORTUGUESE_BRAZIL', number_of_sentences=20),
 Article(id=54010979, language='PORTUGUESE_BRAZIL', number_of_sentences=32),
 Article(id=54011278, language='PORTUGUESE_BRAZIL', number_of_sentences=19),
 Article(id=54012392, language='PORTUGUESE_BRAZIL', number_of_sentences=161),
 Article(id=54015488, language='PORTUGUESE_BRAZIL', number_of_sentences=13),
 Article(id=54015413, language='PORTUGUESE_BRAZIL', number_of_sentences=19)]

## Explore more in detail that everything looks good

Show the first 3 sentences in the article 7

In [69]:
articles[7].sentences[:3]

['Três vezes candidato a presidente da República, Ciro Gomes se prepara para tentar mais uma vez.',
 'Ele acaba de lançar olivro Projeto nacional: o dever da esperança, no qual analisa problemas estruturais do Brasil e apresenta propostas e que funcionará como um guia na caminhada até a próxima campanha nacional, daqui a cerca de dois anos.',
 'Questionado sobre o atual momento da política brasileira, ele afirma que "o que levou o Brasil a esse fundo do poço foi a decepção da esmagadora maioria da opinião pública com as práticas corruptas e a desastrada gestão econômica do PT" e defende um processo de impeachment contra o presidente Jair Bolsonaro.']

Show the names foundn in the first 3 sentences in the article 7

In [13]:
articles[7].persons[:3]

(['Ciro Gomes'], [], ['Jair Bolsonaro'])

Show all names found in article 0

In [20]:
articles[0].persons

(['Clemens Tönnies'],
 [],
 [],
 [],
 [],
 [],
 ['Gerald Asamoah', 'Hans Sarpei', 'Cacau'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Clemens Tönnies'],
 [])

## Convert to a Pandas DataFrame

In [71]:
dfs = []
for article in tqdm(articles):
    for sentence,persons_list in zip(article.sentences,article.persons):
        if len(persons_list)==0:
            persons_list = [""] # convert empty to a list with an empty string so that it is always a string
        df = pd.DataFrame({
            'id':article.id,
            'language':article.language,
            'sentence':sentence,
            'name':persons_list
        })
        dfs.append(df)

100%|██████████| 11848/11848 [01:35<00:00, 124.26it/s]


In [82]:
df = pd.concat(dfs, ignore_index=True)

Check result

In [74]:
df.head(10)

Unnamed: 0,id,language,sentence,name
0,54006424,PORTUGUESE_BRAZIL,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,PORTUGUESE_BRAZIL,O Schalke não conseguiu nenhuma vitória nos ...,
2,54006424,PORTUGUESE_BRAZIL,"A equipe dos ""azuis-reais"" terminou o campeona...",
3,54006424,PORTUGUESE_BRAZIL,A participação em uma dessas grandes competi...,
4,54006424,PORTUGUESE_BRAZIL,Tönnies foi amplamente criticado no ano passa...,
5,54006424,PORTUGUESE_BRAZIL,"No mês de agosto, durante um evento do clube,...",
6,54006424,PORTUGUESE_BRAZIL,O comentário foi criticado como racista por e...,Gerald Asamoah
7,54006424,PORTUGUESE_BRAZIL,O comentário foi criticado como racista por e...,Hans Sarpei
8,54006424,PORTUGUESE_BRAZIL,O comentário foi criticado como racista por e...,Cacau
9,54006424,PORTUGUESE_BRAZIL,"Mais tarde, Tönnies se desculpou pela afirmac...",


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386075 entries, 0 to 386074
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        386075 non-null  int64 
 1   language  386075 non-null  object
 2   sentence  386075 non-null  object
 3   name      386075 non-null  object
dtypes: int64(1), object(3)
memory usage: 11.8+ MB


In [38]:
df.describe(include='all')[:4]

Unnamed: 0,id,language,sentence,name
count,386075.0,386075,386075,386075.0
unique,,1,341975,27952.0
top,,PORTUGUESE_BRAZIL,Diversas autoridades e instituições de saúd...,
freq,,386075,267,260633.0


In [76]:
df.query('name!=""').head(10)

Unnamed: 0,id,language,sentence,name
0,54006424,PORTUGUESE_BRAZIL,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
6,54006424,PORTUGUESE_BRAZIL,O comentário foi criticado como racista por e...,Gerald Asamoah
7,54006424,PORTUGUESE_BRAZIL,O comentário foi criticado como racista por e...,Hans Sarpei
8,54006424,PORTUGUESE_BRAZIL,O comentário foi criticado como racista por e...,Cacau
41,54006424,PORTUGUESE_BRAZIL,A revolta gerada pelo papel da empresa no ress...,Clemens Tönnies
43,54003004,PORTUGUESE_BRAZIL,"Em meados de março, quando o coronavírus com...",Angela Merkel
51,54003004,PORTUGUESE_BRAZIL,"E após anos mantendo o presidente francês, E...",Emmanuel Macron
52,54003004,PORTUGUESE_BRAZIL,"E após anos mantendo o presidente francês, E...",Merkel
55,54003004,PORTUGUESE_BRAZIL,"O ministro alemão das Finanças, Olaf Scholz,...",Olaf Scholz
56,54003004,PORTUGUESE_BRAZIL,"O ministro alemão das Finanças, Olaf Scholz,...",Alexander Hamilton


In [40]:
df.query('name!=""').describe(include='all')[:4]

Unnamed: 0,id,language,sentence,name
count,125442.0,125442,125442,125442
unique,,1,93353,27951
top,,PORTUGUESE_BRAZIL,"O texto reflete a opinião do autor, não nece...",Bolsonaro
freq,,125442,107,7996


## Save

In [77]:
outputdir = Path.cwd()
filename = 'portuguese_sentences_with_names.csv'
df.to_csv(outputdir/filename, index=False)

## Remove redundancy

Read data

In [41]:
inputdir = Path.cwd()
filename = 'portuguese_sentences_with_names.csv'
dtype={'id':int,
       'language':str,
       'sentence':str,
       'name':str
       }

df = pd.read_csv(inputdir/filename, dtype=dtype).fillna('')

In [47]:
df

Unnamed: 0,id,language,sentence,name
0,54006424,PORTUGUESE_BRAZIL,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,PORTUGUESE_BRAZIL,O Schalke não conseguiu nenhuma vitória nos ...,
2,54006424,PORTUGUESE_BRAZIL,"A equipe dos ""azuis-reais"" terminou o campeona...",
3,54006424,PORTUGUESE_BRAZIL,A participação em uma dessas grandes competi...,
4,54006424,PORTUGUESE_BRAZIL,Tönnies foi amplamente criticado no ano passa...,
...,...,...,...,...
386070,54006316,PORTUGUESE_BRAZIL,Na avaliação por grupo de 100 mil habitantes...,
386071,54006316,PORTUGUESE_BRAZIL,"O Rio de Janeiro, a segunda unidade da federac...",
386072,54006316,PORTUGUESE_BRAZIL,O estado mais afetado continua sendo São Paul...,
386073,54006316,PORTUGUESE_BRAZIL,"Em números acumulados, o Brasil tem 790.040 p...",


Drop unnecessary columns

In [48]:
df = df.drop(columns=['language'])

Remove sentences with no found names

In [49]:
df = df.query('name!=""')

Reset index

In [50]:
df = df.reset_index(drop=True)

In [51]:
df

Unnamed: 0,id,sentence,name
0,54006424,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,O comentário foi criticado como racista por e...,Gerald Asamoah
2,54006424,O comentário foi criticado como racista por e...,Hans Sarpei
3,54006424,O comentário foi criticado como racista por e...,Cacau
4,54006424,A revolta gerada pelo papel da empresa no ress...,Clemens Tönnies
...,...,...,...
125437,54006072,"Paulo, o nome mais cotado para substituir Deco...",Anderson Correia
125438,54006072,Também seriam possíveis candidatos o secreta...,Renato Feder
125439,54006072,Também seriam possíveis candidatos o secreta...,Sérgio
125440,54006072,O nome de Freitas é relacionado como orientad...,Freitas


In [None]:
outputdir = Path.cwd()
filename = 'portuguese_sentences_with_names_tight.csv'
df.to_csv(outputdir/filename, index=False)

# Homogenize names (alias) in sentences

Read CSV file

In [527]:
inputdir = Path('/home/stefano')
filename = Path('portuguese_sentences_with_names_tight.csv')
dtype={'id':int,
       'sentence':str,
       'name':str
       }
df = pd.read_csv(inputdir/filename, dtype=dtype)

In [528]:
df.head(10)

Unnamed: 0,id,sentence,name
0,54006424,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,O comentário foi criticado como racista por e...,Gerald Asamoah
2,54006424,O comentário foi criticado como racista por e...,Hans Sarpei
3,54006424,O comentário foi criticado como racista por e...,Cacau
4,54006424,A revolta gerada pelo papel da empresa no ress...,Clemens Tönnies
5,54003004,"Em meados de março, quando o coronavírus com...",Angela Merkel
6,54003004,"E após anos mantendo o presidente francês, E...",Emmanuel Macron
7,54003004,"E após anos mantendo o presidente francês, E...",Merkel
8,54003004,"O ministro alemão das Finanças, Olaf Scholz,...",Olaf Scholz
9,54003004,"O ministro alemão das Finanças, Olaf Scholz,...",Alexander Hamilton


In [529]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125442 entries, 0 to 125441
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        125442 non-null  int64 
 1   sentence  125442 non-null  object
 2   name      125442 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.9+ MB


In [530]:
df.iloc[0]

id                                                   54006424
sentence    O presidente do FC Schalke 04 e proprietário ...
name                                         Clemens Tönnies
Name: 0, dtype: object

## Find sentences with multiple names in it.

In [531]:
# Group by article and sentence to find all names in a sentence
df_grouped = df.groupby(['id','sentence']).agg({'name':lambda x: list(x)}).reset_index()

In [520]:
df_grouped

Unnamed: 0,id,sentence,name
0,36728631,"E você tem Elon, que tem uma abordagem mais p...",[Elon]
1,36728631,"E, falando com Richard Branson, ele teve esses...","[Richard Branson, Peter]"
2,36728631,Ele se encontrou com Elon Musk quando Elon est...,"[Elon Musk, Elon, Richard Branson]"
3,36728631,Jeff estava decidido a fazer tudo o que ele pu...,[Jeff]
4,36728631,"Julian Guthrie, autora do livro How to make a ...",[Julian Guthrie]
...,...,...,...
97739,62463689,"John Bolton, ex-assessor de segurança naciona...","[John Bolton, Donald Trump]"
97740,62463689,Mencionou apenas o malsucedido golpe na Venezu...,"[Nicolás Maduro, Juan Guaido]"
97741,62463689,"O ex-assessor também considerou ""risível"" ac...",[Trump]
97742,62463689,Os comentários de Bolton sobre os distúrbios...,"[Bolton, Trump]"


## Delete row in the DataFrame where name appears in short version

**Example**

In [532]:
# s = 'E você tem Elon, que tem uma abordagem mais pragmática em "escapar" do planeta.'
s = 'Ele se encontrou com Elon Musk quando Elon estava apenas pensando em ir ao espaço, e ele estava se reunindo com Richard Branson.'
id = 36728631

In [533]:
df.query('id==@id and sentence==@s')

Unnamed: 0,id,sentence,name
121359,36728631,Ele se encontrou com Elon Musk quando Elon est...,Elon Musk
121360,36728631,Ele se encontrou com Elon Musk quando Elon est...,Elon
121361,36728631,Ele se encontrou com Elon Musk quando Elon est...,Richard Branson


In [534]:
df.query('id==@id and sentence==@s').name.tolist()

['Elon Musk', 'Elon', 'Richard Branson']

In the sentence above, we will delete the row whose name column puts "Elon" because there is another row whose name column puts "Elon Musk".

## Function

In [535]:
def is_subname(name:str, names:list[str]) -> bool:
    """Check if `name` is a substring of (or is contained in) any name in `names`."""
    
    # Check types
    if not isinstance(name,str):
        TypeError('`name` must be a string or a list whose elements are strings.')
    if not isinstance(names,list) or not all(isinstance(n,str) for n in names):
        raise TypeError('`names` must be a list of strings strings.')
    
    # Main
    for potential_composed_name in names:
        if name != potential_composed_name:
            if name in potential_composed_name:
                return True
    return False

In [536]:
indexes = []
# sample = df_test.iloc[121351:121369] # test line
# pbar = tqdm(sample.itertuples(), total=len(sample.index)) # test line
pbar = tqdm(df.itertuples(), total=len(df.index))
for row in pbar:
    query = 'id==@row.id and sentence==@row.sentence' # Here I assume that the pair row.id and row.sentence are always unique in df
    names = df.query(query).name.tolist()
    if is_subname(row.name,names):
        indexes.append(row.Index)

100%|██████████| 125442/125442 [36:29<00:00, 57.30it/s] 


## Check results

In [303]:
len(indexes)

393

In [538]:
indexes[:10]

[98, 463, 687, 931, 993, 1090, 1255, 1286, 1289, 1514]

In [315]:
i = 98
df[i-2:i+2]

Unnamed: 0,id,sentence,name
96,54018196,As investigações atuais são sobre corrupça...,Queiroz
97,54018196,"Nos últimos anos, no entanto, foi sobretudo o...",Bolsonaro
98,54018196,Bolsonaro venceu a campanha eleitoral de 2018 ...,Bolsonaro
99,54018196,Bolsonaro venceu a campanha eleitoral de 2018 ...,Carlos Bolsonaro


In [316]:
i = 463
df[i-2:i+2]

Unnamed: 0,id,sentence,name
461,54040436,"""Concedemos esta autorização menos de um mê...",Stella Kyriakides
462,54041511,"Mirando o presidente dos EUA, Donald Trump, um...",Donald Trump
463,54041511,"Mirando o presidente dos EUA, Donald Trump, um...",Donald
464,54044041,Às 10 horas da manhã desta sexta-feira (03/0...,Matignon


In [317]:
i = 687
df[i-2:i+2]

Unnamed: 0,id,sentence,name
685,54064053,E entre os mais fiéis existe incômodo com ca...,Bolsonaro
686,54064053,Também veem como ponto de instabilidade a atu...,Flávio Bolsonaro
687,54064053,Também veem como ponto de instabilidade a atu...,Bolsonaro
688,54064053,Isso aparece nos três grupos com intensidade ...,Bolsonaro


## Apply results

Before application

In [318]:
i = 98
df[i-2:i+2]

Unnamed: 0,id,sentence,name
96,54018196,As investigações atuais são sobre corrupça...,Queiroz
97,54018196,"Nos últimos anos, no entanto, foi sobretudo o...",Bolsonaro
98,54018196,Bolsonaro venceu a campanha eleitoral de 2018 ...,Bolsonaro
99,54018196,Bolsonaro venceu a campanha eleitoral de 2018 ...,Carlos Bolsonaro


Application

In [320]:
df = df.drop(index=indexes).reset_index(drop=True)

After application

In [321]:
i = 98
df[i-2:i+2]

Unnamed: 0,id,sentence,name
96,54018196,As investigações atuais são sobre corrupça...,Queiroz
97,54018196,"Nos últimos anos, no entanto, foi sobretudo o...",Bolsonaro
98,54018196,Bolsonaro venceu a campanha eleitoral de 2018 ...,Carlos Bolsonaro
99,54018196,"Desde então, o tom nas redes favoráveis a Bo...",Bolsonaro


In [322]:
df.index.size

125049

In [323]:
125442-df.index.size

393

In [324]:
len(indexes)

393

## Save

In [540]:
outputdir = Path.cwd()
filename = 'portuguese_sentences_with_names_v2.csv'
df.to_csv(outputdir/filename, index=False)

# Remove duplicated names in sentences

Read CSV file

In [541]:
inputdir = Path.cwd()
filename = Path('portuguese_sentences_with_names_v2.csv')
dtype={'id':int,
       'sentence':str,
       'name':str
       }
df = pd.read_csv(inputdir/filename, dtype=dtype)

In [327]:
df.head(10)

Unnamed: 0,id,sentence,name
0,54006424,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,O comentário foi criticado como racista por e...,Gerald Asamoah
2,54006424,O comentário foi criticado como racista por e...,Hans Sarpei
3,54006424,O comentário foi criticado como racista por e...,Cacau
4,54006424,A revolta gerada pelo papel da empresa no ress...,Clemens Tönnies
5,54003004,"Em meados de março, quando o coronavírus com...",Angela Merkel
6,54003004,"E após anos mantendo o presidente francês, E...",Emmanuel Macron
7,54003004,"E após anos mantendo o presidente francês, E...",Merkel
8,54003004,"O ministro alemão das Finanças, Olaf Scholz,...",Olaf Scholz
9,54003004,"O ministro alemão das Finanças, Olaf Scholz,...",Alexander Hamilton


In [542]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125442 entries, 0 to 125441
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        125442 non-null  int64 
 1   sentence  125442 non-null  object
 2   name      125442 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.9+ MB


In [543]:
df.iloc[0]

id                                                   54006424
sentence    O presidente do FC Schalke 04 e proprietário ...
name                                         Clemens Tönnies
Name: 0, dtype: object

## Find sentences with multiple names in it

In [544]:
pd.concat(g for _, g in df.groupby(['id','sentence','name']) if len(g) > 1)

Unnamed: 0,id,sentence,name
121366,36728631,"Peter é também pragmático, mas a abordagem ...",Peter
121369,36728631,"Peter é também pragmático, mas a abordagem ...",Peter
101397,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101398,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101575,51865798,"""Se o senhor Ghosn viesse para a França, não...",Ghosn
...,...,...,...
76214,62426373,Arruda foi assassinado a tiros na noite de sá...,Arruda
76334,62436456,"No entanto, após Moro romper com Bolsonaro em...",Moro
76337,62436456,"No entanto, após Moro romper com Bolsonaro em...",Moro
76762,62463689,"""É Donald Trump cuidando de Donald Trump.",Donald Trump


Check a few ones

In [546]:
i = 121366
df.loc[i-3:i+3]

Unnamed: 0,id,sentence,name
121363,36728631,"E, falando com Richard Branson, ele teve esses...",Richard Branson
121364,36728631,"E, falando com Richard Branson, ele teve esses...",Peter
121365,36728631,"E você tem Elon, que tem uma abordagem mais p...",Elon
121366,36728631,"Peter é também pragmático, mas a abordagem ...",Peter
121367,36728631,"Peter é também pragmático, mas a abordagem ...",Elon
121368,36728631,"Peter é também pragmático, mas a abordagem ...",Jeff Bezos
121369,36728631,"Peter é também pragmático, mas a abordagem ...",Peter


In [548]:
i = 101397
df.loc[i-3:i+3]

Unnamed: 0,id,sentence,name
101394,51850454,"Para Nogueira, as áreas de cultura, meio ambi...",Ernesto Araújo
101395,51850454,"Para Nogueira, essa foi a tendência de 2019 e...",Nogueira
101396,51850454,O cientista político Jairo Nicolau lembra que...,Jairo Nicolau
101397,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101398,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101399,51850454,O cientista político Jairo Nicolau lembra que...,Viktor Orba
101400,51850454,"""A retórica continua terrível, antidemocrát...",Nicolau


In [549]:
i = 101575
df.loc[i-3:i+3]

Unnamed: 0,id,sentence,name
101572,51865798,Antes de pousar com seu jato particular na cap...,Ghosn
101573,51865798,"Agora, essa etapa da jornada também tem conse...",Ghosn
101574,51865798,Fugindo das autoridades japonesas e possivelme...,Ghosn
101575,51865798,"""Se o senhor Ghosn viesse para a França, não...",Ghosn
101576,51865798,"""Se o senhor Ghosn viesse para a França, não...",Ghosn
101577,51865798,"""Se o senhor Ghosn viesse para a França, não...",Agnès Pannier - Runacher
101578,51865798,As autoridades francesas disseram ter tido con...,Ghosn


Get the index of all repetead names (a.k.a. indexes we attempt to remove from the DataFrame) per sentence except for the first appareance.

All indexes

In [None]:
pd.concat(g for _, g in df.groupby(['id','sentence','name']) if len(g) > 1)

Unnamed: 0,id,sentence,name
121366,36728631,"Peter é também pragmático, mas a abordagem ...",Peter
121369,36728631,"Peter é também pragmático, mas a abordagem ...",Peter
101397,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101398,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101575,51865798,"""Se o senhor Ghosn viesse para a França, não...",Ghosn
...,...,...,...
76214,62426373,Arruda foi assassinado a tiros na noite de sá...,Arruda
76334,62436456,"No entanto, após Moro romper com Bolsonaro em...",Moro
76337,62436456,"No entanto, após Moro romper com Bolsonaro em...",Moro
76762,62463689,"""É Donald Trump cuidando de Donald Trump.",Donald Trump


Target indexes

In [554]:
pd.concat(g.iloc[1:] for _, g in df.groupby(['id','sentence','name']) if len(g) > 1)

Unnamed: 0,id,sentence,name
121369,36728631,"Peter é também pragmático, mas a abordagem ...",Peter
101398,51850454,O cientista político Jairo Nicolau lembra que...,Bolsonaro
101576,51865798,"""Se o senhor Ghosn viesse para a França, não...",Ghosn
102155,51919375,A Nissan assegurou nesta terça-feira que a fu...,Ghosn
102215,51921279,"""O assassinato de Soleimani não muda nada na ...",Soleimani
...,...,...,...
75656,62401502,A União das Organizações Indígenas do Vale...,Pereira
75657,62401502,A União das Organizações Indígenas do Vale...,Phillips
76214,62426373,Arruda foi assassinado a tiros na noite de sá...,Arruda
76337,62436456,"No entanto, após Moro romper com Bolsonaro em...",Moro


Save the indexes

In [555]:
indexes = pd.concat(g.iloc[1:] for _, g in df.groupby(['id','sentence','name']) if len(g) > 1).index
indexes

Int64Index([121369, 101398, 101576, 102155, 102215, 102225, 103347, 103348,
            103349, 103452,
            ...
             74001,  74295,  75111,  75555,  75623,  75656,  75657,  76214,
             76337,  76763],
           dtype='int64', length=604)

Remove indexes

In [556]:
df = df.drop(index=indexes).reset_index(drop=True)

Check results

In [557]:
df

Unnamed: 0,id,sentence,name
0,54006424,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,O comentário foi criticado como racista por e...,Gerald Asamoah
2,54006424,O comentário foi criticado como racista por e...,Hans Sarpei
3,54006424,O comentário foi criticado como racista por e...,Cacau
4,54006424,A revolta gerada pelo papel da empresa no ress...,Clemens Tönnies
...,...,...,...
124833,54006072,"Paulo, o nome mais cotado para substituir Deco...",Anderson Correia
124834,54006072,Também seriam possíveis candidatos o secreta...,Renato Feder
124835,54006072,Também seriam possíveis candidatos o secreta...,Sérgio
124836,54006072,O nome de Freitas é relacionado como orientad...,Freitas


Confirm result

In [558]:
[g for _, g in df.groupby(['id','sentence','name']) if len(g) > 1]

[]

In [559]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124838 entries, 0 to 124837
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        124838 non-null  int64 
 1   sentence  124838 non-null  object
 2   name      124838 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.9+ MB


## Save

In [560]:
outputdir = Path.cwd()
filename = 'portuguese_sentences_with_names_v3.csv'
df.to_csv(outputdir/filename, index=False)

# Homogenize names (alias) in articles

Read CSV file

In [595]:
inputdir = Path.cwd()
filename = 'portuguese_sentences_with_names_v3.csv'
dtype={'id':int,
       'sentence':str,
       'name':str
       }
df = pd.read_csv(inputdir/filename, dtype=dtype)

In [596]:
df.head(10)

Unnamed: 0,id,sentence,name
0,54006424,O presidente do FC Schalke 04 e proprietário ...,Clemens Tönnies
1,54006424,O comentário foi criticado como racista por e...,Gerald Asamoah
2,54006424,O comentário foi criticado como racista por e...,Hans Sarpei
3,54006424,O comentário foi criticado como racista por e...,Cacau
4,54006424,A revolta gerada pelo papel da empresa no ress...,Clemens Tönnies
5,54003004,"Em meados de março, quando o coronavírus com...",Angela Merkel
6,54003004,"E após anos mantendo o presidente francês, E...",Emmanuel Macron
7,54003004,"E após anos mantendo o presidente francês, E...",Merkel
8,54003004,"O ministro alemão das Finanças, Olaf Scholz,...",Olaf Scholz
9,54003004,"O ministro alemão das Finanças, Olaf Scholz,...",Alexander Hamilton


In [597]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124838 entries, 0 to 124837
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        124838 non-null  int64 
 1   sentence  124838 non-null  object
 2   name      124838 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.9+ MB


In [598]:
df.iloc[0]

id                                                   54006424
sentence    O presidente do FC Schalke 04 e proprietário ...
name                                         Clemens Tönnies
Name: 0, dtype: object

## Remove column for sentence

In [599]:
df = df.drop(columns=['sentence'])
df

Unnamed: 0,id,name
0,54006424,Clemens Tönnies
1,54006424,Gerald Asamoah
2,54006424,Hans Sarpei
3,54006424,Cacau
4,54006424,Clemens Tönnies
...,...,...
124833,54006072,Anderson Correia
124834,54006072,Renato Feder
124835,54006072,Sérgio
124836,54006072,Freitas


## Find articles with multiple names in it.

In [600]:
df_grouped = df.groupby(['id']).agg({'name':lambda x: list(x)})
df_grouped

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
36728631,"[Julian Guthrie, Julian Guthrie, Peter Diamand..."
51740080,"[Francisco, Dionísio Kuduavicz, Dionísio, Jo..."
51846913,"[Donald Trump, Jair Bolsonaro, Trump, Bolsonar..."
51850454,"[Jair Messias Bolsonaro, Michel Temer, Bolsona..."
51850907,[Massa]
...,...
62461355,"[Trump, Stephanie Murphy, Murphy, Trump, Cassi..."
62461496,"[Marcos do Val, Jair Bolsonaro, Rodrigo Pachec..."
62462074,"[Joe Biden, Avi Avisana, Yair Lapid, Naftali B..."
62463146,"[Gergely Gulyas, Viktor Orban, Gulyas, Gulyas,..."


Check some articles (ids)

In [601]:
id = 36728631
names = df.query('id==@id').name.tolist()
print(f'Number of names = {len(names)}')
names

Number of names = 22


['Julian Guthrie',
 'Julian Guthrie',
 'Peter Diamandis',
 'Peter',
 'Peter',
 'Jeff Bezos',
 'Elon Musk',
 'Elon',
 'Richard Branson',
 'Peter Diamandis',
 'Richard Branson',
 'Peter',
 'Elon',
 'Peter',
 'Elon',
 'Jeff Bezos',
 'Jeff',
 'Peter',
 'Burt Rutan',
 'Dumitru Popescu',
 'Steve Bennett',
 'Elon Musk']

## Delete row in the DataFrame where name appears in short version

In [568]:
indexes = []
# sample = df_test.iloc[121351:121369] # test line
# pbar = tqdm(sample.itertuples(), total=len(sample.index)) # test line
pbar = tqdm(df.itertuples(), total=len(df.index))
for row in pbar:
    query = 'id==@row.id'
    names = df.query(query).name.tolist()
    if is_subname(row.name,names):
        indexes.append(row.Index)

100%|██████████| 124838/124838 [06:33<00:00, 316.89it/s]


## Check results

In [602]:
len(indexes)

56775

In [603]:
indexes[:10]

[7, 10, 12, 14, 15, 17, 19, 20, 31, 36]

In [604]:
i = 7
df.loc[i-10:i+10]

Unnamed: 0,id,name
0,54006424,Clemens Tönnies
1,54006424,Gerald Asamoah
2,54006424,Hans Sarpei
3,54006424,Cacau
4,54006424,Clemens Tönnies
5,54003004,Angela Merkel
6,54003004,Emmanuel Macron
7,54003004,Merkel
8,54003004,Olaf Scholz
9,54003004,Alexander Hamilton


In [605]:
indexes[200:210]

[467, 468, 469, 472, 473, 476, 477, 478, 479, 488]

In [606]:
i = 467
df.loc[i-10:i+10]

Unnamed: 0,id,name
457,54040436,Stella Kyriakides
458,54041511,Donald Trump
459,54041511,Donald
460,54044041,Matignon
461,54044041,Édouard Philippe
462,54044041,Emmanuel Macron
463,54044041,Jean Castex
464,54044041,Philippe
465,54044041,Macron
466,54044041,Philippe


## Apply results

Before application

In [607]:
i = 467
df.loc[i-10:i+10]

Unnamed: 0,id,name
457,54040436,Stella Kyriakides
458,54041511,Donald Trump
459,54041511,Donald
460,54044041,Matignon
461,54044041,Édouard Philippe
462,54044041,Emmanuel Macron
463,54044041,Jean Castex
464,54044041,Philippe
465,54044041,Macron
466,54044041,Philippe


Check application without making permanent changes

In [608]:
i = 467
df.drop(index=indexes).loc[i-10:i+10]

Unnamed: 0,id,name
457,54040436,Stella Kyriakides
458,54041511,Donald Trump
460,54044041,Matignon
461,54044041,Édouard Philippe
462,54044041,Emmanuel Macron
463,54044041,Jean Castex
470,54044041,Jean Castex
471,54044041,Nicolas Sarkozy
474,54044041,Florence Parly
475,54044041,Jean - Yves Drian


Application

In [609]:
df = df.drop(index=indexes)

After application

In [610]:
i = 467
df.loc[i-10:i+10]

Unnamed: 0,id,name
457,54040436,Stella Kyriakides
458,54041511,Donald Trump
460,54044041,Matignon
461,54044041,Édouard Philippe
462,54044041,Emmanuel Macron
463,54044041,Jean Castex
470,54044041,Jean Castex
471,54044041,Nicolas Sarkozy
474,54044041,Florence Parly
475,54044041,Jean - Yves Drian


Reset indexes

In [611]:
df = df.reset_index(drop=True)

In [615]:
df.head(10)

Unnamed: 0,id,name
0,54006424,Clemens Tönnies
1,54006424,Gerald Asamoah
2,54006424,Hans Sarpei
3,54006424,Cacau
4,54006424,Clemens Tönnies
5,54003004,Angela Merkel
6,54003004,Emmanuel Macron
7,54003004,Olaf Scholz
8,54003004,Alexander Hamilton
9,54003004,Alexander Hamilton


## Save

In [618]:
outputdir = Path.cwd()
filename = 'portuguese_articles_and_names_v2.csv'
df.to_csv(outputdir/filename, index=False)

# Remove duplicated names in articles

Read CSV file

In [619]:
inputdir = Path.cwd()
filename = 'portuguese_articles_and_names_v2.csv'
dtype={'id':int,
       'name':str
       }
df = pd.read_csv(inputdir/filename, dtype=dtype)

In [620]:
df.head(10)

Unnamed: 0,id,name
0,54006424,Clemens Tönnies
1,54006424,Gerald Asamoah
2,54006424,Hans Sarpei
3,54006424,Cacau
4,54006424,Clemens Tönnies
5,54003004,Angela Merkel
6,54003004,Emmanuel Macron
7,54003004,Olaf Scholz
8,54003004,Alexander Hamilton
9,54003004,Alexander Hamilton


In [621]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68063 entries, 0 to 68062
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      68063 non-null  int64 
 1   name    68063 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [622]:
df.iloc[0]

id              54006424
name    Clemens Tönnies
Name: 0, dtype: object

## Find articles with multiple names in it

In [623]:
pd.concat(g for _, g in df.groupby(['id','name']) if len(g) > 1)

Unnamed: 0,id,name
65706,36728631,Elon Musk
65714,36728631,Elon Musk
65705,36728631,Jeff Bezos
65710,36728631,Jeff Bezos
65702,36728631,Julian Guthrie
...,...,...
41469,62463689,Donald Trump
41467,62463689,John Bolton
41473,62463689,John Bolton
41470,62463689,Nicolás Maduro


Check a few ones

In [632]:
i = 65706
print(f'Repeated name: {df.loc[i]["name"]}')
print(f'In article: {df.loc[i]["id"]}')
df.loc[i-10:i+10]

Repeated name: Elon Musk
In article: 36728631


Unnamed: 0,id,name
65696,53631101,Dominic Raab
65697,53631101,Priti Patel
65698,53622574,Josephine
65699,53622574,Chang Heung - Lin
65700,53622574,Dominic Raab
65701,53622574,Priti Patel
65702,36728631,Julian Guthrie
65703,36728631,Julian Guthrie
65704,36728631,Peter Diamandis
65705,36728631,Jeff Bezos


In [633]:
i = 41469
print(f'Repeated name: {df.loc[i]["name"]}')
print(f'In article: {df.loc[i]["id"]}')
df.loc[i-10:i+10]

Repeated name: Donald Trump
In article: 62463689


Unnamed: 0,id,name
41459,62462074,Mahmoud Abbas
41460,62462074,Jesus
41461,62462074,Simon Rishmawi
41462,62462074,Miral Assaf
41463,62462074,Shireen Abu Akleh
41464,62463146,Gergely Gulyas
41465,62463146,Viktor Orban
41466,62463146,Peter Szijjarto
41467,62463689,John Bolton
41468,62463689,Donald Trump


Get the index of all repetead names (a.k.a. indexes we attempt to remove from the DataFrame) per sentence except for the first appareance.

All indexes

In [634]:
pd.concat(g for _, g in df.groupby(['id','name']) if len(g) > 1)

Unnamed: 0,id,name
65706,36728631,Elon Musk
65714,36728631,Elon Musk
65705,36728631,Jeff Bezos
65710,36728631,Jeff Bezos
65702,36728631,Julian Guthrie
...,...,...
41469,62463689,Donald Trump
41467,62463689,John Bolton
41473,62463689,John Bolton
41470,62463689,Nicolás Maduro


Target indexes

In [635]:
pd.concat(g.iloc[1:] for _, g in df.groupby(['id','name']) if len(g) > 1)

Unnamed: 0,id,name
65714,36728631,Elon Musk
65710,36728631,Jeff Bezos
65703,36728631,Julian Guthrie
65708,36728631,Peter Diamandis
65709,36728631,Richard Branson
...,...,...
41446,62461496,Marcos do Val
41478,62463689,Bush
41469,62463689,Donald Trump
41473,62463689,John Bolton


Save the indexes

In [636]:
indexes = pd.concat(g.iloc[1:] for _, g in df.groupby(['id','name']) if len(g) > 1).index
indexes

Int64Index([65714, 65710, 65703, 65708, 65709, 54954, 54946, 54947, 54948,
            54951,
            ...
            41430, 41433, 41436, 41437, 41440, 41446, 41478, 41469, 41473,
            41472],
           dtype='int64', length=10794)

Remove indexes

In [637]:
df = df.drop(index=indexes).reset_index(drop=True)

Check results

In [640]:
df.head(20)

Unnamed: 0,id,name
0,54006424,Clemens Tönnies
1,54006424,Gerald Asamoah
2,54006424,Hans Sarpei
3,54006424,Cacau
4,54003004,Angela Merkel
5,54003004,Emmanuel Macron
6,54003004,Olaf Scholz
7,54003004,Alexander Hamilton
8,54003004,Christine Lagarde
9,54003004,Ursula von der Leyen


Confirm result

In [642]:
[g for _, g in df.groupby(['id','name']) if len(g) > 1]

[]

In [643]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57269 entries, 0 to 57268
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      57269 non-null  int64 
 1   name    57269 non-null  object
dtypes: int64(1), object(1)
memory usage: 895.0+ KB


## Save

In [645]:
outputdir = Path.cwd()
filename = 'portuguese_articles_and_names_v3.csv'
df.to_csv(outputdir/filename, index=False)

## Read

In [646]:
inputdir = Path.cwd()
filename = 'portuguese_articles_and_names_v3.csv'
dtype={'id':int,
       'name':str
       }
df = pd.read_csv(inputdir/filename, dtype=dtype)

In [647]:
df

Unnamed: 0,id,name
0,54006424,Clemens Tönnies
1,54006424,Gerald Asamoah
2,54006424,Hans Sarpei
3,54006424,Cacau
4,54003004,Angela Merkel
...,...,...
57264,54006072,Paulo
57265,54006072,Anderson Correia
57266,54006072,Renato Feder
57267,54006072,Sérgio
