# **Import Libraries**

In [1]:
import re
import os
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import xml.etree.ElementTree as ET

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# **Fetch the XML**

In [4]:
url = "https://www.europarl.europa.eu/meps/en/full-list/xml"

In [5]:
response = requests.get(url)

In [6]:
if response.status_code == 200:
    xml_data = response.text
    print('XML data fetched successfully!')
else:
    print('Failed to fetch XML:', response.statues_code)

XML data fetched successfully!


In [7]:
xml_data # xml_data is just a long string, not yet structured.

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><meps><mep><fullName>Mika AALTOLA</fullName><country>Finland</country><politicalGroup>Group of the European People\'s Party (Christian Democrats)</politicalGroup><id>256810</id><nationalPoliticalGroup>Kansallinen Kokoomus</nationalPoliticalGroup></mep><mep><fullName>Maravillas ABADÍA JOVER</fullName><country>Spain</country><politicalGroup>Group of the European People\'s Party (Christian Democrats)</politicalGroup><id>257043</id><nationalPoliticalGroup>Partido Popular</nationalPoliticalGroup></mep><mep><fullName>Magdalena ADAMOWICZ</fullName><country>Poland</country><politicalGroup>Group of the European People\'s Party (Christian Democrats)</politicalGroup><id>197490</id><nationalPoliticalGroup>Independent</nationalPoliticalGroup></mep><mep><fullName>Georgios AFTIAS</fullName><country>Greece</country><politicalGroup>Group of the European People\'s Party (Christian Democrats)</politicalGroup><id>256820</id><nationalPoliticalGroup>Ne

# **Parsing the XML**

In [8]:
root = ET.fromstring(xml_data) ## Parse the XML string

In [9]:
print("Root tag:", root.tag) # Print the root tag to see the structure

Root tag: meps


# **XML Structure**

In [10]:
first_mep = root[0]  # first <mep> element
for child in first_mep:
    print(child.tag, ":", child.text)

fullName : Mika AALTOLA
country : Finland
politicalGroup : Group of the European People's Party (Christian Democrats)
id : 256810
nationalPoliticalGroup : Kansallinen Kokoomus


In [11]:
for mep in root[:5]:
    full_name = mep.findtext('fullName', default="Unknown")
    country = mep.findtext('country', default="Unknown")
    print(full_name, country)

Mika AALTOLA Finland
Maravillas ABADÍA JOVER Spain
Magdalena ADAMOWICZ Poland
Georgios AFTIAS Greece
Oihane AGIRREGOITIA MARTÍNEZ Spain


In [12]:
# full_name = "Oihane AGIRREGOITIA MARTÍNEZ"
# parts = full_name.split()
# first_name = parts[0]  # "Oihane"
# last_name = " ".join(parts[1:])  # "AGIRREGOITIA MARTÍNEZ"
# print(parts)
# print(first_name)
# print(last_name)

In [13]:
mep_data = []

for mep in root:
    full_name = mep.findtext('fullName', default="Unknown")
    country = mep.findtext('country', default="Unknown")
    
    name_parts = full_name.split()
    first_name = name_parts[0] if len(name_parts) > 0 else ''
    last_name = ' '.join(name_parts[1:]) if len(name_parts) > 1 else ''

    mep_data.append({
        'full_name': full_name,
        'first_name': first_name,
        'last_name': last_name,
        'country': country
    })

In [14]:
for person in mep_data[:5]:
    print(person)

{'full_name': 'Mika AALTOLA', 'first_name': 'Mika', 'last_name': 'AALTOLA', 'country': 'Finland'}
{'full_name': 'Maravillas ABADÍA JOVER', 'first_name': 'Maravillas', 'last_name': 'ABADÍA JOVER', 'country': 'Spain'}
{'full_name': 'Magdalena ADAMOWICZ', 'first_name': 'Magdalena', 'last_name': 'ADAMOWICZ', 'country': 'Poland'}
{'full_name': 'Georgios AFTIAS', 'first_name': 'Georgios', 'last_name': 'AFTIAS', 'country': 'Greece'}
{'full_name': 'Oihane AGIRREGOITIA MARTÍNEZ', 'first_name': 'Oihane', 'last_name': 'AGIRREGOITIA MARTÍNEZ', 'country': 'Spain'}


In [15]:
print("Total MEPs:", len(mep_data))

Total MEPs: 719


In [16]:
df = pd.DataFrame(mep_data)

In [17]:
df.sample(8)

Unnamed: 0,full_name,first_name,last_name,country
456,Rasmus NORDQVIST,Rasmus,NORDQVIST,Denmark
691,Séverine WERBROUCK,Séverine,WERBROUCK,France
297,Virginie JORON,Virginie,JORON,France
194,Luke Ming FLANAGAN,Luke,Ming FLANAGAN,Ireland
520,Diana RIBA I GINER,Diana,RIBA I GINER,Spain
273,Hannes HEIDE,Hannes,HEIDE,Austria
188,Jonás FERNÁNDEZ,Jonás,FERNÁNDEZ,Spain
393,Thierry MARIANI,Thierry,MARIANI,France


In [18]:
print("Total MEPs:", len(df))

Total MEPs: 719


In [19]:
X = df['full_name']
y = df['country']

In [20]:
len(X), len(y)

(719, 719)

In [21]:
y.value_counts() # very imbalanced data

country
Germany        96
France         81
Italy          76
Spain          60
Poland         53
Romania        33
Netherlands    31
Belgium        22
Portugal       21
Sweden         21
Greece         21
Czechia        21
Hungary        21
Austria        20
Bulgaria       17
Slovakia       15
Denmark        15
Finland        15
Ireland        14
Croatia        12
Lithuania      11
Slovenia        9
Latvia          9
Estonia         7
Malta           6
Luxembourg      6
Cyprus          6
Name: count, dtype: int64

# **Train_Test_Split**

In [22]:
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, 
                                                              stratify = y, 
                                                              test_size = 0.2, 
                                                              random_state = 42)

In [23]:
print(y_train.value_counts())
print(y_test.value_counts())

country
Germany        77
France         65
Italy          61
Spain          48
Poland         42
Romania        26
Netherlands    25
Czechia        17
Belgium        17
Portugal       17
Greece         17
Sweden         17
Hungary        17
Austria        16
Bulgaria       13
Slovakia       12
Finland        12
Denmark        12
Ireland        11
Croatia         9
Lithuania       9
Latvia          7
Slovenia        7
Estonia         6
Luxembourg      5
Malta           5
Cyprus          5
Name: count, dtype: int64
country
Germany        19
France         16
Italy          15
Spain          12
Poland         11
Romania         7
Netherlands     6
Belgium         5
Sweden          4
Portugal        4
Hungary         4
Greece          4
Czechia         4
Bulgaria        4
Austria         4
Ireland         3
Finland         3
Slovakia        3
Croatia         3
Denmark         3
Lithuania       2
Latvia          2
Slovenia        2
Malta           1
Cyprus          1
Estonia         1
Luxe

# **Vectorization**

In [24]:
vectorizer = CountVectorizer()

In [25]:
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [26]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((575, 1115), (144, 1115), (575,), (144,))

In [27]:
len(vectorizer.vocabulary_)

1115

In [28]:
len(np.unique(y_train)), len(np.unique(y_test))

(27, 27)

# **Model Development**

In [29]:
model = MultinomialNB()

In [30]:
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [31]:
model.score(X_train, y_train)

0.7930434782608695

In [32]:
model.score(X_test, y_test)

0.2569444444444444

### **GridSearch**

In [79]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', MultinomialNB())
])

In [109]:
param_grid = {
    'vectorizer__analyzer': ['char'],  # only char
    'vectorizer__ngram_range': [(1,2), (1,5), (2,2), (2,3), (2,4), (3,3)],
    'vectorizer__min_df': [1, 2, 3],
    'model__alpha': [0.1, 0.5, 1.0, 1.5]
}

In [110]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1_macro',  # better for imbalanced classes
    n_jobs=-1
)

In [111]:
grid.fit(X_train_text, y_train)

0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"{'model__alpha': [0.1, 0.5, ...], 'vectorizer__analyzer': ['char'], 'vectorizer__min_df': [1, 2, ...], 'vectorizer__ngram_range': [(1, ...), (1, ...), ...]}"
,scoring,'f1_macro'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(2, ...)"

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [112]:
print("Best params:", grid.best_params_)

Best params: {'model__alpha': 0.1, 'vectorizer__analyzer': 'char', 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (2, 4)}


In [113]:
print("Best CV F1 macro score:", grid.best_score_)

Best CV F1 macro score: 0.29062135820424156


In [114]:
print("Test accuracy:", grid.score(X_test_text, y_test))

Test accuracy: 0.23032605254827476


In [115]:
y_pred = grid.predict(X_test_text)
print("Test F1 (macro):", f1_score(y_test, y_pred, average='macro'))

Test F1 (macro): 0.23032605254827476
