# Serbian Parliament NLP analysis

## Setup & Imports

In [1]:
# Imports
import json
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import logging
from gensim import corpora, models
import stanza
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import seaborn as sns
import pandas as pd
import os
import sys
from pathlib import Path

In [2]:
#os.chdir("/home/vuk/Documents/0 Data Science/serbian_parliament_nlp_analysis")

In [3]:
#project_root = Path().resolve().parent
#sys.path.append(str(project_root))

In [5]:
# Put the parent of the current working directory on sys.path so that
# project-level modules (imported in the next cell) can be found.
# NOTE(review): Path() is the current working directory, so this only works
# when the notebook is started from a direct subfolder of the project root.
project_root = Path().resolve().parent
sys.path.append(str(project_root))

In [6]:
import config

In [8]:
# Stanza pipeline for Serbian ("sr") with the tokenize, pos and lemma
# processors, forced onto the CPU (use_gpu=False). It is shared by
# tokenize_and_lemmatize() as a module-level global.
nlp = stanza.Pipeline("sr", processors="tokenize,pos,lemma", use_gpu=False)

2025-05-11 11:56:23 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-11 11:56:24 INFO: Downloaded file to /home/vuk/stanza_resources/resources.json
2025-05-11 11:56:24 INFO: Loading these models for language: sr (Serbian):
| Processor | Package      |
----------------------------
| tokenize  | set          |
| pos       | set_nocharlm |
| lemma     | set_nocharlm |

2025-05-11 11:56:24 INFO: Using device: cpu
2025-05-11 11:56:24 INFO: Loading: tokenize
2025-05-11 11:56:24 INFO: Loading: pos
2025-05-11 11:56:25 INFO: Loading: lemma
2025-05-11 11:56:25 INFO: Done loading processors!


## Data Loading

In [9]:
def load_data(path):
    """Load a UTF-8 JSON file into a DataFrame.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A DataFrame built from the parsed JSON. If the file does not exist,
        the error is logged and an empty DataFrame is returned instead.
    """
    try:
        handle = open(path, encoding="utf-8")
    except FileNotFoundError:
        logging.error(f"File not found: {path}")
        return pd.DataFrame()

    # The context manager closes the file even if parsing fails.
    with handle:
        records = json.load(handle)

    logging.info(f"Loaded {len(records)} records from {path}")
    return pd.DataFrame(records)

In [10]:
def load_stopwords(path):
    """Read a stopword file (one entry per line) into a set.

    Each line is stripped of surrounding whitespace and lowercased; lines
    that are empty after stripping are skipped.

    Args:
        path: Location of the UTF-8 encoded stopword file.

    Returns:
        A set of lowercase stopwords.
    """
    entries = set()
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                entries.add(cleaned.lower())
    return entries

In [11]:
# Load the speeches into a DataFrame and the stopword list into a set.
# The speeches are read with pd.read_json directly; load_data() is not used here.
df = pd.read_json(config.SPEECHES_JSON)
stopwords = load_stopwords(config.STOPWORDS_TXT)

## Preprocessing

In [12]:
def tokenize_and_lemmatize(text, stopwords):
    """Lemmatize *text* with the module-level Stanza pipeline ``nlp``.

    Args:
        text: Raw text to process.
        stopwords: Container of lowercase words to discard.

    Returns:
        The lowercased lemmas in document order, keeping only words whose
        lemma is non-empty, is not a stopword (compared in lowercase) and is
        longer than three characters.
    """
    kept = []
    for sentence in nlp(text).sentences:
        for token in sentence.words:
            lemma = token.lemma
            if not lemma:
                continue
            lowered = lemma.lower()
            if lowered in stopwords or len(lemma) <= 3:
                continue
            kept.append(lowered)
    return kept

In [13]:
# Basic cleaning: force both text columns to str and record the length of
# each speech in characters.
# NOTE(review): astype(str) also turns missing values into literal strings
# such as 'nan' — confirm the input has no missing speeches or speakers.
df['speech'] = df['speech'].astype(str)
df['speaker'] = df['speaker'].astype(str)
df['speech_len'] = df['speech'].str.len()

In [None]:
# Tokenize and clean: run tokenize_and_lemmatize on every speech (one
# pipeline call per row), then join the kept lemmas of each speech into a
# single space-separated string.
df['clean_tokens'] = df['speech'].apply(lambda x: tokenize_and_lemmatize(x, stopwords))
df['clean_text'] = df['clean_tokens'].apply(lambda tokens: ' '.join(tokens))

In [None]:
# Write df to config.LEMMATIZED_JSON as a list of records;
# force_ascii=False keeps non-ASCII characters unescaped in the output.
df.to_json(config.LEMMATIZED_JSON, orient="records", force_ascii=False)

## Visualizations

In [None]:
def plot_wordcloud(frequencies, title='Most Common Non-Stop Words'):
    """Draw and display a word cloud built from word frequencies.

    Args:
        frequencies: Mapping of word -> frequency passed to
            ``WordCloud.generate_from_frequencies``.
        title: Title shown above the image.
    """
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

In [None]:
def plot_histogram(data, title, xlabel, ylabel, bins=20, xlim=None, save_path=None):
    """Plot and display a histogram of *data*.

    Args:
        data: Object with a ``hist`` method (e.g. a pandas Series).
        title: Figure title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        bins: Number of histogram bins.
        xlim: Optional ``(low, high)`` pair limiting the x axis.
        save_path: Optional file path. When given, the figure is written
            there (dpi=300) before it is shown.
    """
    plt.figure(figsize=(8, 5))
    data.hist(bins=bins, color='skyblue')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if xlim:
        plt.xlim(*xlim)
    plt.grid(True)
    if save_path is not None:
        # Save before show(): once show() has run the figure may already be
        # closed (as with the inline notebook backend), so a savefig() issued
        # by the caller afterwards would write an empty image.
        plt.savefig(save_path, dpi=300)
    plt.show()

## WordCloud + Length Distribution

In [None]:
# Word frequency: count every lemma across all speeches.
word_freq = Counter()
# A plain loop is used because update() is called only for its side effect;
# Series.apply would build (and the notebook would display) a Series of None.
for tokens in df['clean_tokens']:
    word_freq.update(tokens)

In [None]:
# Speech length in characters. This repeats the assignment already made in
# the "Basic Cleaning" cell, so it only matters if that cell was skipped.
df['speech_len'] = df['speech'].str.len()

In [None]:
# Create the word cloud from the lemma counts, showing at most 500 words.
wordcloud = WordCloud(
    background_color="white",
    max_words=500,
    colormap="viridis"
).generate_from_frequencies(word_freq)

# Plot and save. savefig() is called before show() so the file contains the
# rendered figure.
# NOTE(review): the output path is absolute and machine-specific.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout()
plt.savefig("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/03 figures/wordcloud_serbia.png", dpi=300)
plt.show()

In [None]:
# Two-column table (word, frequency) built from the Counter.
# The same statement is repeated at the top of the next cell.
word_freq_df = pd.DataFrame(word_freq.items(), columns=['word', 'frequency'])

In [None]:
# Two-column table (word, frequency) built from the Counter.
word_freq_df = pd.DataFrame(word_freq.items(), columns=['word', 'frequency'])

# Sort by frequency, most frequent first.
top_words = word_freq_df.sort_values(by='frequency', ascending=False)

# Display the top 20 (last expression of the cell, rendered by the notebook).
top_words.head(20)

In [None]:
# Horizontal bar chart of the 20 most frequent words; saved before being shown.
# NOTE(review): the output path is absolute and machine-specific.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_words.head(20), x='frequency', y='word', color='skyblue')
plt.title("Top 20 Most Frequent Words")
plt.xlabel("Frequency")
plt.ylabel("Word")
plt.tight_layout()
plt.savefig("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/03 figures/top_words_serbia.png", dpi=300)
plt.show()

In [None]:
# Export the full (unsorted) word-frequency table without the index column.
# NOTE(review): the output path is absolute and machine-specific.
word_freq_df.to_csv("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/0 data/02 interim/word_frequencies.csv", index=False)

In [None]:
# Compute speech_len only if the column does not exist yet.
if 'speech_len' not in df.columns:
    df['speech_len'] = df['speech'].str.len()

# Keep only the speaker and length columns for the export below.
speech_len_df = df[['speaker', 'speech_len']]

In [None]:
# Export one row per speech (speaker, length in characters) without the index.
# NOTE(review): the output path is absolute and machine-specific.
speech_len_df.to_csv("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/0 data/02 interim/speech_lengths.csv", index=False)

In [None]:
# Histogram of speech lengths, limited to 0-30000 characters on the x axis.
# The plot is drawn here instead of through plot_histogram() because that
# helper calls plt.show() itself: saving after show() writes an empty image,
# since the figure is no longer current. savefig() must come before show().
plt.figure(figsize=(8, 5))
df['speech_len'].hist(bins=20, color='skyblue')
plt.title('Distribution of Speech Lengths')
plt.xlabel('Speech Length (Characters)')
plt.ylabel('Number of Speeches')
plt.xlim(0, 30000)
plt.grid(True)
plt.savefig("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/03 figures/speech_lengths.png", dpi=300)
plt.show()

In [None]:
# Number of speeches per speaker; value_counts() orders by count, highest first.
top_speakers_by_count = df['speaker'].value_counts().reset_index()
# Give the two columns explicit names regardless of what reset_index() produced.
top_speakers_by_count.columns = ['speaker', 'num_speeches']
print(top_speakers_by_count.head(15))

In [None]:
# Total number of characters spoken per speaker, largest total first.
top_speakers_by_length = df.groupby('speaker')['speech_len'].sum().sort_values(ascending=False).reset_index()
top_speakers_by_length.columns = ['speaker', 'total_speech_len']
print(top_speakers_by_length.head(15))

In [None]:
# Per-speaker summary: number of speeches, total length and mean length
# (both in characters), ordered by number of speeches, highest first.
speaker_stats = df.groupby('speaker').agg(
    num_speeches=('speech', 'count'),
    total_speech_len=('speech_len', 'sum'),
    avg_speech_len=('speech_len', 'mean')
).sort_values(by='num_speeches', ascending=False).reset_index()

print(speaker_stats.head(15))

In [None]:
# The 15 most active speakers. The variable keeps its historical name
# `top10` so that any later reference to it still works.
top10 = speaker_stats.head(15)

# Bar chart of speeches per speaker; saved before being shown.
# The title now matches the 15 rows that are actually plotted.
# NOTE(review): the output path is absolute and machine-specific.
plt.figure(figsize=(10, 6))
sns.barplot(data=top10, x='num_speeches', y='speaker', color='skyblue')
plt.title("Top 15 Speakers by Number of Speeches")
plt.xlabel("Number of Speeches")
plt.tight_layout()
plt.savefig("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/03 figures/top_speakers.png", dpi=300)
plt.show()

In [None]:
# The 15 speakers with the largest total speech length.
top15_length = top_speakers_by_length.head(15)

# Bar chart of total speech length per speaker; saved before being shown.
# speech_len comes from str.len(), i.e. it counts characters, so the
# "or Words" part of the x-axis label is inaccurate.
# NOTE(review): the output path is absolute and machine-specific.
plt.figure(figsize=(10, 6))
sns.barplot(data=top15_length, x='total_speech_len', y='speaker', color='mediumseagreen')
plt.title("Top 15 Speakers by Total Speech Length")
plt.xlabel("Total Speech Length (Characters or Words)")
plt.ylabel("Speaker")
plt.tight_layout()
plt.savefig("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/03 figures/top_speakers_2.png", dpi=300)
plt.show()

## Topic Modeling

In [None]:
# Raise the 'gensim' logger to WARNING so that its INFO/DEBUG messages are
# not printed during topic modeling.
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)

In [None]:
def topic_modeling(texts, num_topics=10):
    """Fit an LDA topic model on tokenized documents.

    Args:
        texts: Sequence of token lists, one per document. It is iterated
            twice (dictionary building and bag-of-words conversion).
        num_topics: Number of topics to fit.

    Returns:
        A ``(lda, dictionary, corpus)`` tuple: the fitted LdaModel, the
        gensim Dictionary built from *texts*, and the bag-of-words corpus.
    """
    vocabulary = corpora.Dictionary(texts)

    bow_corpus = []
    for doc_tokens in texts:
        bow_corpus.append(vocabulary.doc2bow(doc_tokens))

    # Fixed random_state keeps the fitted topics reproducible between runs.
    model = models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        passes=10,
        random_state=42,
    )
    return model, vocabulary, bow_corpus

In [None]:
# NUM_TOPICS is not defined anywhere in this notebook, so the original call
# failed with a NameError on a fresh kernel. Keep an existing value if one is
# already set; otherwise read it from config when present, and finally fall
# back to topic_modeling's own default of 10.
if "NUM_TOPICS" not in globals():
    NUM_TOPICS = getattr(config, "NUM_TOPICS", 10)

lda_model, dictionary, corpus = topic_modeling(df['clean_tokens'].tolist(), NUM_TOPICS)

# Print every topic with its 10 highest-weighted words. formatted=False
# yields (word, weight) pairs directly, which avoids parsing the
# '0.123*"word" + ...' strings and breaking on words containing '*' or ' + '.
for idx, terms in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False):
    print(f"Topic #{idx}:")
    for word, weight in terms:
        print(f"  {float(weight):.3f} {word}")
    print()

## Sentiment Analysis

In [None]:
# Load the external Serbian sentiment lexicon (CSV).
# NOTE(review): the input path is absolute and machine-specific.
sentiment_df = pd.read_csv("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/0 data/03 external/serbian_sentiment_latin.csv")

In [None]:
# Keep one row per word (the first occurrence), index the lexicon by word and
# turn it into a nested dict of the form {word: {column: value}}.
sentiment_df = sentiment_df.drop_duplicates(subset=["Serbian Word"])
sentiment_df.set_index("Serbian Word", inplace=True)
sentiment_dict = sentiment_df.to_dict(orient="index")

In [None]:
def get_emotion_scores(tokens, lexicon):
    """Sum the lexicon vectors of all tokens found in *lexicon*.

    Args:
        tokens: Iterable of tokens to score. Repeated tokens are counted
            once per occurrence.
        lexicon: Mapping of token -> {label: numeric value}. Every entry is
            expected to hold the same labels in the same order.

    Returns:
        A float NumPy array with one accumulated value per label, in the
        order of the lexicon entries' values. Tokens missing from the
        lexicon contribute nothing. An empty lexicon yields an empty array.
    """
    if not lexicon:
        # No entry to infer the vector length from; previously this raised
        # StopIteration from next(iter(...)).
        return np.zeros(0)

    scores = np.zeros(len(next(iter(lexicon.values()))))
    for token in tokens:
        entry = lexicon.get(token)  # single lookup instead of `in` + indexing
        if entry is not None:
            scores += np.array(list(entry.values()))
    return scores

In [None]:
import numpy as np
# Lexicon value columns (everything except the "Serbian Word" index).
emotion_columns = sentiment_df.columns.tolist()

# Score every speech against the lexicon. Each score vector is expanded into
# a row via pd.Series and assigned to one new df column per lexicon column.
df[emotion_columns] = df['clean_tokens'].apply(lambda tokens: pd.Series(get_emotion_scores(tokens, sentiment_dict)))

In [None]:
# Add a column with the dominant emotion (the column with the highest score).
# idxmax returns the first column on ties, so a speech without a single
# lexicon hit (all scores zero) would be labelled with the first emotion
# column. Such rows are left missing instead.
emotion_scores = df[emotion_columns]
has_signal = (emotion_scores != 0).any(axis=1)
df['dominant_emotion'] = emotion_scores.idxmax(axis=1).where(has_signal)

# Or just sum total emotion occurrence across all speeches
total_emotions = df[emotion_columns].sum().sort_values(ascending=False)
print(total_emotions)

In [None]:
# Load the speaker -> party mapping.
# NOTE(review): the input path is absolute and machine-specific.
party_map = pd.read_csv("/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/0 data/04 metadata/party_map.csv")
# Normalize the join key on both sides (upper case, no surrounding
# whitespace) so that spelling differences in case/spacing do not block a match.
df['speaker'] = df['speaker'].str.upper().str.strip()
party_map['speaker'] = party_map['speaker'].str.upper().str.strip()

# Left merge keeps every speech; speakers absent from party_map get missing
# values in the added columns.
# NOTE(review): if party_map lists a speaker more than once, that speaker's
# speeches are duplicated by the merge — confirm the mapping is unique.
df = df.merge(party_map, on='speaker', how='left')

In [None]:
# Hard-coded list of the score columns to aggregate.
# NOTE(review): presumably identical to the lexicon columns in
# emotion_columns — confirm against the lexicon CSV header.
emotion_cols = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']

# Corpus-wide total per emotion, largest first.
total_emotions = df[emotion_cols].sum().sort_values(ascending=False)

# Bar chart of the totals (displayed only, not saved). hue is set to the
# x variable with the legend disabled so that the palette colors each bar.
plt.figure(figsize=(10, 5))
sns.barplot(x=total_emotions.index, y=total_emotions.values, hue=total_emotions.index, palette="viridis", dodge=False, legend=False)
plt.title("Total Emotion Counts Across All Speeches")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Sum the emotion scores of all speeches per party.
party_emotions = df.groupby('party')[emotion_cols].sum()

# Divide each party's row by its own total so the values become proportions
# that sum to 1 per party. A party whose total is 0 ends up with NaN values.
party_emotions_norm = party_emotions.div(party_emotions.sum(axis=1), axis=0)

In [None]:
# Reshape to long format: one row per (party, emotion) pair with its
# proportion in the 'score' column, as expected by the per-party plots.
party_emotions_long = party_emotions_norm.reset_index().melt(
    id_vars='party',
    var_name='emotion',
    value_name='score'
)

In [None]:
# Directory that receives one PNG per party.
# NOTE(review): the path is absolute and machine-specific.
output_dir = "/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/03 figures"

unique_parties = party_emotions_long['party'].unique()

# One bar chart of emotion proportions per party.
for party in unique_parties:
    subset = party_emotions_long[party_emotions_long['party'] == party]

    plt.figure(figsize=(6, 4))
    ax = sns.barplot(
        data=subset,
        x='emotion',
        y='score',
        hue='emotion',
        dodge=False,
        palette='viridis',
        legend=False
    )
    plt.title(f"Emotion Proportions – {party}")
    plt.ylabel("Proportion")
    plt.xticks(rotation=45)

    # Add value labels just above each bar; i is the bar's x position.
    for i, row in enumerate(subset.itertuples()):
        ax.text(i, row.score + 0.0005, f"{row.score:.2f}", ha='center', va='bottom', fontsize=8)
    # NOTE(review): tight_layout() is called right after this and likely
    # overrides the manual adjustment — confirm whether it has any effect.
    plt.subplots_adjust(top=0.50)
    plt.tight_layout()

    # Build a file name from the party: lower case, spaces and slashes
    # replaced by underscores.
    filename = f"emotions_{party.lower().replace(' ', '_').replace('/', '_')}.png"
    full_path = os.path.join(output_dir, filename)
    # Save before show() so the file contains the rendered figure, then
    # close it to free the figure before the next iteration.
    plt.savefig(full_path, dpi=300)
    plt.show()  
    plt.close()

    print(f"Saved: {full_path}")

## Summary (Optional Export)

In [None]:
# List the columns currently in df as a check before the export below.
print(df.columns.tolist())

In [None]:
# Export a subset of the final DataFrame: speaker, raw and cleaned text,
# length in characters, the positive/negative scores and the dominant emotion.
# NOTE(review): the output path is absolute and machine-specific.
df[['speaker', 'speech', 'clean_text', 'speech_len', 'positive', 'negative', 'dominant_emotion']].to_csv(
    "/home/vuk/Documents/0 Data Science/parliament_nlp_analysis/0 data/02 interim/cleaned_speeches.csv",
    index=False
)