In [4]:
from lxml import etree
import pandas as pd

# Fetch data

In [5]:
data_path = "data"

## Organizations

# Italy

In [6]:
def get_country_parliamentary_groups(root, language):
    info = []
    for elem in root.findall(
        "./org[@role='parliamentaryGroup']", namespaces=root.nsmap
    ):
        org_id = "".join(("#", elem.values()[-1]))
        org_abb = elem.findall("./orgName[@full='abb']", namespaces=root.nsmap)[0].text
        org_name = elem.findall("./orgName[@full='yes']", namespaces=root.nsmap)[
            -1
        ].text
        subelem = elem.findall(".//state/state", namespaces=root.nsmap)
        if subelem:
            orientation = subelem[0].get("ana")
        else:
            orientation = None
        info.append(
            {
                "abbreviations": org_abb,
                "party_id": org_id,
                "full_name": org_name,
                "orientation": orientation,
                "language": language,
            }
        )

    return pd.DataFrame(info)

In [7]:
language = "IT"
tree_it = etree.parse(
    f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listOrg.xml"
)
root_it = tree_it.getroot()

df_it = get_country_parliamentary_groups(root_it, language)
df_it

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,LN-Aut,#group.LN-Aut,Lega Nord e Autonomie,#orientation.RRF,IT
1,"GAL (GS, PpI, M, Id, E-E, MPL, RI)",#group.GAL.GS.PpI.M.Id.E-E.MPL.RI,"Grandi Autonomie e Libertà (Grande Sud, Popola...",#orientation.CR,IT
2,PD,#group.PD,Partito Democratico,#orientation.CL,IT
3,FIBP-UDC,#group.FIBP-UDC,Forza Italia Berlusconi Presidente-UDC,#orientation.CR,IT
4,NcI,#group.NcI,Noi con l'Italia,#orientation.CR,IT
5,ALA-PRI,#group.ALA-PRI,ALA (Alleanza Liberal Popolare) - PRI (Partito...,#orientation.C,IT
6,CoR,#group.CoR,Conservatori e Riformisti,#orientation.CR,IT
7,"GAL (GS, PpI, FV, M)",#group.GAL.GS.PpI.FV.M,"Grandi Autonomie e Libertà (Grande Sud, Popola...",#orientation.CR,IT
8,IpF-CD,#group.IpF-CD,Insieme per il futuro - Centro Democratico,#orientation.C,IT
9,SCpI,#group.SCpI,Scelta Civica per l'Italia,#orientation.C,IT


## Hungary

In [8]:
language = "HU"
tree_hu = etree.parse(
    f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listOrg.xml"
)
root_hu = tree_hu.getroot()

df_hu = get_country_parliamentary_groups(root_hu, language)

In [9]:
df_hu

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
1,KNDP-frakció,#parliamentaryGroup,Parliamentary group of the Christian Democrati...,#orientation.R,HU
2,MSZP-frakció,#parliamentaryGroup,Parliamentary group of the Hungarian Socialist...,#orientation.CL,HU
3,JOBBIK-frakció,#parliamentaryGroup,Parliamentary group of the Movement for a Bett...,#orientation.CR,HU
4,LMP-frakció,#parliamentaryGroup,Parliamentary group of the Politics Can be Dif...,#orientation.CCL,HU
5,DK-frakció,#parliamentaryGroup,Parliamentary group of the Democratic Coalitio...,#orientation.CL,HU
6,Párbeszéd-frakció,#parliamentaryGroup,Parliamentary group of the Dialogue for Hungar...,#orientation.CLL,HU
7,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
8,KNDP-frakció,#parliamentaryGroup,Parliamentary group of the Christian Democrati...,#orientation.R,HU
9,MSZP-frakció,#parliamentaryGroup,Parliamentary group of the Hungarian Socialist...,#orientation.CL,HU


In [10]:
language = "GR"
tree_gr = etree.parse(
    f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listOrg.xml"
)
root_gr = tree_gr.getroot()

df_gr = get_country_parliamentary_groups(root_gr, language)
df_gr

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,Ν.Δ.,#parliamentaryGroup,New Democracy,#orientation.CR,GR
1,ΑΝ.ΕΛ.,#parliamentaryGroup,Independent Greeks National Patriotic Alliance,#orientation.R,GR
2,ΔΗ.ΣΥ.,#parliamentaryGroup,Democratic Alignment,#orientation.CL,GR
3,Ε.Λ.,#parliamentaryGroup,Greek Solution,#orientation.RRF,GR
4,Ε.Κ.,#parliamentaryGroup,Union of Centrists,#orientation.C,GR
5,ΚΙΝ.ΑΛ.,#parliamentaryGroup,Movement for Change,#orientation.CL,GR
6,Κ.Κ.Ε.,#parliamentaryGroup,Communist Party of Greece,,GR
7,ΛΑ.Ε.,#parliamentaryGroup,Popular Unity,#orientation.LLF,GR
8,Χ.Α.,#parliamentaryGroup,Popular Association – Golden Dawn,#orientation.FR,GR
9,ΜέΡΑ25,#parliamentaryGroup,European Realistic Disobedience Front,#orientation.L,GR


In [11]:
language = "ES"
tree_es = etree.parse(
    f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listOrg.xml"
)
root_es = tree_es.getroot()

df_es = get_country_parliamentary_groups(root_es, language)
df_es

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,PSC(PSC-PSOE),#party.PSCPSCPSOE,Partido de los Socialistas de Cataluña,#orientation.CL,ES
1,PSOEdeAndalucía,#party.PSOEdeAndalucía,Partido Socialista Obrero Español de Andalucía,#orientation.CL,ES
2,PP-EU,#party.PPEU,Partido Popular,#orientation.CRR,ES
3,EUiA,#party.EUiA,Esquerra Unida i Alternativa,#orientation.L,ES
4,CC-NC-PNC,#party.CCNCPNC,Coalición Canaria-Nueva Canaria-Partido Nacion...,#orientation.CCR,ES
5,Compromís-Q,#party.COMPROMÍSQ,Comprommís-Q,,ES
6,ERC-RI.cat,#party.ERCRIcat,Esquerra Republicana de Catalunya - Reagrupame...,#orientation.CLL,ES
7,EUPV,#party.EUPV,Esquerra Unida del País Valencià,#orientation.L,ES
8,GB,#party.GB,Geroa Bai,#orientation.CL,ES
9,ICV,#party.ICV,Iniciativa per Catalunya Verds,#orientation.L,ES


In [12]:
parties = pd.concat([df_es, df_it, df_hu, df_gr])
parties.to_csv(f"{data_path}/parties.csv", index=False)

In [13]:
parties

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,PSC(PSC-PSOE),#party.PSCPSCPSOE,Partido de los Socialistas de Cataluña,#orientation.CL,ES
1,PSOEdeAndalucía,#party.PSOEdeAndalucía,Partido Socialista Obrero Español de Andalucía,#orientation.CL,ES
2,PP-EU,#party.PPEU,Partido Popular,#orientation.CRR,ES
3,EUiA,#party.EUiA,Esquerra Unida i Alternativa,#orientation.L,ES
4,CC-NC-PNC,#party.CCNCPNC,Coalición Canaria-Nueva Canaria-Partido Nacion...,#orientation.CCR,ES
...,...,...,...,...,...
8,Χ.Α.,#parliamentaryGroup,Popular Association – Golden Dawn,#orientation.FR,GR
9,ΜέΡΑ25,#parliamentaryGroup,European Realistic Disobedience Front,#orientation.L,GR
10,ΠΑ.ΣΟ.Κ.,#parliamentaryGroup,Panhellenic Socialist Movement,#orientation.CL,GR
11,ΣΥ.ΡΙΖ.Α.,#parliamentaryGroup,Coalition of the Radical Left – Progressive Al...,#orientation.CLL,GR


In [14]:
len(parties)

129

In [15]:
far_right = parties[parties.orientation.isin(["#orientation.FR", "#orientation.RRF"])]

len(far_right)
print(len(far_right))
far_right  # .to_csv('far_right.csv',index=False)

12


Unnamed: 0,abbreviations,party_id,full_name,orientation,language
49,Vox,#party.Vox,Vox,#orientation.RRF,ES
0,LN-Aut,#group.LN-Aut,Lega Nord e Autonomie,#orientation.RRF,IT
13,FdI,#group.FdI,Fratelli d'Italia,#orientation.RRF,IT
25,L-SP,#group.L-SP,Lega-Salvini Premier,#orientation.RRF,IT
32,L-SP-PSd'Az,#group.L-SP-PSd.Az,Lega-Salvini Premier-Partito Sardo d'Azione,#orientation.RRF,IT
39,PI,#group.PI,Per l'Italia,#orientation.FR,IT
0,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
7,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
12,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
20,Mi Hazánk-frakció,#parliamentaryGroup,Parliamentary group of the Our Homeland Moveme...,#orientation.FR,HU


In [16]:
parties[parties.language == "ES"]

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,PSC(PSC-PSOE),#party.PSCPSCPSOE,Partido de los Socialistas de Cataluña,#orientation.CL,ES
1,PSOEdeAndalucía,#party.PSOEdeAndalucía,Partido Socialista Obrero Español de Andalucía,#orientation.CL,ES
2,PP-EU,#party.PPEU,Partido Popular,#orientation.CRR,ES
3,EUiA,#party.EUiA,Esquerra Unida i Alternativa,#orientation.L,ES
4,CC-NC-PNC,#party.CCNCPNC,Coalición Canaria-Nueva Canaria-Partido Nacion...,#orientation.CCR,ES
5,Compromís-Q,#party.COMPROMÍSQ,Comprommís-Q,,ES
6,ERC-RI.cat,#party.ERCRIcat,Esquerra Republicana de Catalunya - Reagrupame...,#orientation.CLL,ES
7,EUPV,#party.EUPV,Esquerra Unida del País Valencià,#orientation.L,ES
8,GB,#party.GB,Geroa Bai,#orientation.CL,ES
9,ICV,#party.ICV,Iniciativa per Catalunya Verds,#orientation.L,ES


In [17]:
parties

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,PSC(PSC-PSOE),#party.PSCPSCPSOE,Partido de los Socialistas de Cataluña,#orientation.CL,ES
1,PSOEdeAndalucía,#party.PSOEdeAndalucía,Partido Socialista Obrero Español de Andalucía,#orientation.CL,ES
2,PP-EU,#party.PPEU,Partido Popular,#orientation.CRR,ES
3,EUiA,#party.EUiA,Esquerra Unida i Alternativa,#orientation.L,ES
4,CC-NC-PNC,#party.CCNCPNC,Coalición Canaria-Nueva Canaria-Partido Nacion...,#orientation.CCR,ES
...,...,...,...,...,...
8,Χ.Α.,#parliamentaryGroup,Popular Association – Golden Dawn,#orientation.FR,GR
9,ΜέΡΑ25,#parliamentaryGroup,European Realistic Disobedience Front,#orientation.L,GR
10,ΠΑ.ΣΟ.Κ.,#parliamentaryGroup,Panhellenic Socialist Movement,#orientation.CL,GR
11,ΣΥ.ΡΙΖ.Α.,#parliamentaryGroup,Coalition of the Radical Left – Progressive Al...,#orientation.CLL,GR


# Speakers info

## Spain

In [18]:
def get_info_people(root):
    info_people = []

    for elem in root.findall(".//person", namespaces=root.nsmap):
        name = elem.values()[0]
        try:
            sex = elem.find(".//sex", namespaces=elem.nsmap).get("value")
        except:
            sex = None
        birth = (
            elem.find(".//birth", namespaces=elem.nsmap).get("when")
            if elem.find(".//birth", namespaces=elem.nsmap)
            else None
        )
        party = (
            elem.findall(".//affiliation", namespaces=elem.nsmap)[-1].get("ref")
            if elem.findall(".//affiliation", namespaces=elem.nsmap)
            else None
        )

        info_people.append({"name": name, "sex": sex, "birth": birth, "party": party})

    people_df = pd.DataFrame(info_people)
    return people_df

In [19]:
language = "ES"
file_person = f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listPerson.xml"
tree_es = etree.parse(file_person)
root_es = tree_es.getroot()
people_es = get_info_people(root_es)

  birth = elem.find('.//birth', namespaces=elem.nsmap).get('when') if elem.find('.//birth', namespaces=elem.nsmap) else None


In [20]:
people_es

Unnamed: 0,name,sex,birth,party
0,AdolfoPérezAbellás,M,1952-09-25,#party.PsdeGPSOE
1,AdolfoSuárezIllana,M,,#party.PP
2,AdrianaLastraFernández,F,1979-03-30,#party.PSOE
3,AgustínAlmodóbarBarceló,M,,#party.PP
4,AgustínCondeBajén,M,,#party.PP
...,...,...,...,...
936,ÍñigodelaSerna,,,#GOV
937,ÓscarClavellLópez,M,,#party.PP
938,ÓscarGaleanoGracia,M,,#party.PSOE
939,ÓscarGamazoMicó,M,1976-04-17,#party.PP


## Italy

In [21]:
language = "IT"
file_person = f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listPerson.xml"
tree_it = etree.parse(file_person)
root_it = tree_it.getroot()
people_it = get_info_people(root_it)
people_it

  birth = elem.find('.//birth', namespaces=elem.nsmap).get('when') if elem.find('.//birth', namespaces=elem.nsmap) else None


Unnamed: 0,name,sex,birth,party
0,BongiornoGiulia,F,1966-03-22,#GOV
1,ParagoneGianluigi,M,1971-08-07,#group.Misto
2,BocciGianpiero,M,1962-08-10,#GOV
3,PerilliGianluca,M,1973-12-05,#group.M5S.2
4,MarcucciAndrea,M,1965-05-28,#group.PD
...,...,...,...,...
766,PinottiRoberta,F,1961-05-20,#GOV
767,MinasiTilde,F,1960-07-24,#group.L-SP-PSd.Az
768,BerrettaGiuseppe,F,1970-06-11,#GOV
769,NenciniRiccardo,M,1959-10-19,#GOV


## Hungary

In [22]:
language = "HU"
file_person = f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listPerson.xml"
tree_hu = etree.parse(file_person)
root_hu = tree_hu.getroot()
people_hu = get_info_people(root_hu)
people_hu

  birth = elem.find('.//birth', namespaces=elem.nsmap).get('when') if elem.find('.//birth', namespaces=elem.nsmap) else None


Unnamed: 0,name,sex,birth,party
0,AderJanos,M,,#REP
1,AghPeter,M,,#ministry.EpitesKozl
2,AlexovLyubomir,M,,#org.MNemzB
3,AnderBalazs,M,,#org.NOB
4,ApatiIstvan,M,,#org.VB
...,...,...,...,...
487,ZavogyanMagdolna,F,,#ministry.KIM
488,LanszkiRegoBalazs,M,,#ministry.EpitesKozl
489,SzalayBobrovniczkyKristof,M,,#GOV
490,IbolyaTiborBela,M,,#org.LU


## Greece

In [23]:
language = "GR"
file_person = f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/ParlaMint-{language}-listPerson.xml"
tree_gr = etree.parse(file_person)
root_gr = tree_gr.getroot()
people_gr = get_info_people(root_gr)
people_gr

Unnamed: 0,name,sex,birth,party
0,ΝΙΚΟΛΟΠΟΥΛΟΣ_ΙΩΑΝΝΟΥ_ΝΙΚΟΛΑΟΣ,M,,#PoGR
1,ΗΓΟΥΜΕΝΙΔΗΣ_ΕΜΜΑΝΟΥΗΛ_ΝΙΚΟΛΑΟΣ,M,,#PoGR
2,ΑΘΑΝΑΣΙΟΣ_ΗΛΙΟΠΟΥΛΟΣ,M,,#GOV
3,ΡΙΖΟΥΛΗΣ_ΚΩΝΣΤΑΝΤΙΝΟΥ_ΑΝΔΡΕΑΣ,M,,#PoGR
4,ΣΗΦΑΚΗΣ_ΓΕΩΡΓΙΟΥ_ΙΩΑΝΝΗΣ,M,,#PoGR
...,...,...,...,...
630,ΙΩΑΝΝΗΣ_ΣΤΟΥΡΝΑΡΑΣ,M,,
631,ΙΕΡΕΥΣ,M,,
632,ΑΝΔΡΕΑΣ_ΛΥΚΟΥΡΕΝΤΖΟΣ,M,,
633,FRANÇOIS_HOLLANDE,M,,


In [24]:
speakers = pd.concat([people_es, people_it, people_hu, people_gr])
speakers.to_csv(f"{data_path}/people.csv", index=False)

## Speeches Extraction

In [38]:
import os
from collections import defaultdict

In [62]:
def get_speeches_from_document(root, speeches, file):
    speech = []
    speaker = "UNK"

    for elem in root.findall(".//", namespaces=root.nsmap):
        if elem.tag == "{http://www.tei-c.org/ns/1.0}w":
            speech.append(elem.text)
        if elem.tag == "{http://www.tei-c.org/ns/1.0}u":
            previous_speaker = speaker
            speeches[previous_speaker].append(" ".join(speech))
            speaker = elem.get("who")
            speech = []
    return speeches

In [63]:
language = "ES"

In [64]:
# file_speech = f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/2021/ParlaMint-{language}-en_2021-02-02-CD210202.ana.xml"
# tree = etree.parse(file_speech)
# root = tree.getroot()

In [66]:
def get_speeches_country(language):
    for year in ["2021", "2022"]:
        files_folder = f"{data_path}/ParlaMint-{language}-en.ana/ParlaMint-{language}-en.TEI.ana/{year}"
        files = os.listdir(files_folder)

        speeches = defaultdict(list)

        for file in files:
            tree = etree.parse(f"{files_folder}/{file}")
            root = tree.getroot()
            try:
                speeches = get_speeches_from_document(root, speeches, file)
            except:
                print(f"{file} failed :0")
    return speeches

In [67]:
speeches_es = get_speeches_country("ES")

In [68]:
speeches_it = get_speeches_country("IT")

In [69]:
speeches_gr = get_speeches_country("GR")

In [70]:
speeches_hu = get_speeches_country("HU")

In [71]:
parties

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,PSC(PSC-PSOE),#party.PSCPSCPSOE,Partido de los Socialistas de Cataluña,#orientation.CL,ES
1,PSOEdeAndalucía,#party.PSOEdeAndalucía,Partido Socialista Obrero Español de Andalucía,#orientation.CL,ES
2,PP-EU,#party.PPEU,Partido Popular,#orientation.CRR,ES
3,EUiA,#party.EUiA,Esquerra Unida i Alternativa,#orientation.L,ES
4,CC-NC-PNC,#party.CCNCPNC,Coalición Canaria-Nueva Canaria-Partido Nacion...,#orientation.CCR,ES
...,...,...,...,...,...
8,Χ.Α.,#parliamentaryGroup,Popular Association – Golden Dawn,#orientation.FR,GR
9,ΜέΡΑ25,#parliamentaryGroup,European Realistic Disobedience Front,#orientation.L,GR
10,ΠΑ.ΣΟ.Κ.,#parliamentaryGroup,Panhellenic Socialist Movement,#orientation.CL,GR
11,ΣΥ.ΡΙΖ.Α.,#parliamentaryGroup,Coalition of the Radical Left – Progressive Al...,#orientation.CLL,GR


In [74]:
speakers = speakers.rename(columns={"party": "party_id"})

In [77]:
speakers_party_info = speakers.merge(parties, how="inner", on=["party_id"])

In [80]:
alt_right_speakers_info = speakers_party_info[
    speakers_party_info.orientation.isin(["#orientation.FR", "#orientation.RRF"])
]

In [82]:
alt_right_speakers_info["name_id"] = alt_right_speakers_info["name"].map(
    lambda name: "".join(("#", name))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alt_right_speakers_info['name_id'] = alt_right_speakers_info['name'].map(lambda name: ''.join(('#', name)))


In [84]:
alt_right_speakers_info.to_csv("alt_right_speakers_info.csv", index=False)

In [87]:
alt_right_speakers_info.groupby(["language", "orientation", "sex", "party_id"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,name,birth,abbreviations,full_name,name_id
language,orientation,sex,party_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ES,#orientation.RRF,F,#party.Vox,15,4,15,15,15
ES,#orientation.RRF,M,#party.Vox,40,17,40,40,40
IT,#orientation.RRF,F,#group.FdI,4,4,4,4,4
IT,#orientation.RRF,F,#group.L-SP,11,11,11,11,11
IT,#orientation.RRF,F,#group.L-SP-PSd.Az,6,6,6,6,6
IT,#orientation.RRF,F,#group.LN-Aut,1,1,1,1,1
IT,#orientation.RRF,M,#group.FdI,19,19,19,19,19
IT,#orientation.RRF,M,#group.L-SP,33,33,33,33,33
IT,#orientation.RRF,M,#group.L-SP-PSd.Az,3,3,3,3,3
IT,#orientation.RRF,M,#group.LN-Aut,4,4,4,4,4


In [90]:
parties[parties.language == "GR"]

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,Ν.Δ.,#parliamentaryGroup,New Democracy,#orientation.CR,GR
1,ΑΝ.ΕΛ.,#parliamentaryGroup,Independent Greeks National Patriotic Alliance,#orientation.R,GR
2,ΔΗ.ΣΥ.,#parliamentaryGroup,Democratic Alignment,#orientation.CL,GR
3,Ε.Λ.,#parliamentaryGroup,Greek Solution,#orientation.RRF,GR
4,Ε.Κ.,#parliamentaryGroup,Union of Centrists,#orientation.C,GR
5,ΚΙΝ.ΑΛ.,#parliamentaryGroup,Movement for Change,#orientation.CL,GR
6,Κ.Κ.Ε.,#parliamentaryGroup,Communist Party of Greece,,GR
7,ΛΑ.Ε.,#parliamentaryGroup,Popular Unity,#orientation.LLF,GR
8,Χ.Α.,#parliamentaryGroup,Popular Association – Golden Dawn,#orientation.FR,GR
9,ΜέΡΑ25,#parliamentaryGroup,European Realistic Disobedience Front,#orientation.L,GR


In [91]:
parties[parties.language == "HU"]

Unnamed: 0,abbreviations,party_id,full_name,orientation,language
0,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
1,KNDP-frakció,#parliamentaryGroup,Parliamentary group of the Christian Democrati...,#orientation.R,HU
2,MSZP-frakció,#parliamentaryGroup,Parliamentary group of the Hungarian Socialist...,#orientation.CL,HU
3,JOBBIK-frakció,#parliamentaryGroup,Parliamentary group of the Movement for a Bett...,#orientation.CR,HU
4,LMP-frakció,#parliamentaryGroup,Parliamentary group of the Politics Can be Dif...,#orientation.CCL,HU
5,DK-frakció,#parliamentaryGroup,Parliamentary group of the Democratic Coalitio...,#orientation.CL,HU
6,Párbeszéd-frakció,#parliamentaryGroup,Parliamentary group of the Dialogue for Hungar...,#orientation.CLL,HU
7,Fidesz-frakció,#parliamentaryGroup,Parliamentary group of the Fidesz – Hungarian ...,#orientation.RRF,HU
8,KNDP-frakció,#parliamentaryGroup,Parliamentary group of the Christian Democrati...,#orientation.R,HU
9,MSZP-frakció,#parliamentaryGroup,Parliamentary group of the Hungarian Socialist...,#orientation.CL,HU


In [100]:
alt_right_speakers = set(alt_right_speakers_info["name_id"])

In [109]:
all_speeches = dict()

In [122]:
for speaker, speeches in speeches_es.items():
    if speaker in alt_right_speakers:
        all_speeches[speaker] = speeches

for speaker, speeches in speeches_it.items():
    if speaker in alt_right_speakers:
        all_speeches[speaker] = speeches

In [123]:
len(all_speeches)

108

In [126]:
df = pd.DataFrame.from_dict(
    {"speaker": all_speeches.keys(), "speeches": all_speeches.values()}
)
df = df.explode("speeches").reset_index(drop=True)

In [135]:
df

Unnamed: 0,speaker,speeches
0,#MireiaBorrásPabón,Good afternoon ladies and gentlemen Mr Echeniq...
1,#MireiaBorrásPabón,Thank you President Good morning ladies and ge...
2,#MireiaBorrásPabón,Thank you President Good afternoon deputies Th...
3,#MireiaBorrásPabón,Thank you President Good afternoon ladies and ...
4,#MireiaBorrásPabón,In what part of the menu are the worms and pro...
...,...,...
2980,#BossiSimone,BOSSI Simone L SP PSd Az Mr President unfortun...
2981,#BossiSimone,Mr President Mr Undersecretary ladies and gent...
2982,#PetrengaGiovanna,I ask you to speak for an explanation of vote
2983,#PetrengaGiovanna,PETRENGA FdI Mr President today we are debatin...


In [136]:
additional_info = alt_right_speakers_info[
    ["party_id", "full_name", "name_id", "language", "orientation", "sex"]
].rename(columns={"name_id": "speaker"})

In [137]:
additional_info

Unnamed: 0,party_id,full_name,speaker,language,orientation,sex
5,#party.Vox,Vox,#AgustínRosetyFernándezDeCastro,ES,#orientation.RRF,M
11,#party.Vox,Vox,#AlbertoAsartaCuevas,ES,#orientation.RRF,M
45,#party.Vox,Vox,#AndrésAlbertoRodríguezAlmeida,ES,#orientation.RRF,M
67,#party.Vox,Vox,#AntonioSalváVerd,ES,#orientation.RRF,M
85,#party.Vox,Vox,#CarlaToscanoDeBalbín,ES,#orientation.RRF,F
...,...,...,...,...,...,...
1295,#group.FdI,Fratelli d'Italia,#BalboniAlberto,IT,#orientation.RRF,M
1299,#group.L-SP,Lega-Salvini Premier,#RomeoMassimiliano,IT,#orientation.RRF,M
1303,#group.L-SP,Lega-Salvini Premier,#RipamontiPaolo,IT,#orientation.RRF,M
1313,#group.FdI,Fratelli d'Italia,#MalanLucio,IT,#orientation.RRF,M


In [141]:
df.merge(additional_info, how="inner", on="speaker").rename(
    columns={"speaker": "speaker_id", "language": "county", "full_name": "party_name"}
).to_csv("preprocessed_data.csv", index=False)