# Serbian Parliament NLP analysis

## Setup & Imports

In [None]:
# Imports
import json
import logging
import os
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import stanza
from gensim import corpora, models
from wordcloud import WordCloud

In [None]:
# Logging
# Configure the root logger at INFO level so informational messages
# (such as the record count logged by load_data) are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
# Stanza pipeline
# Pipeline for language code "sr", restricted to the tokenize, pos and lemma
# processors, with GPU use disabled.
# NOTE(review): presumably the "sr" models must already be downloaded
# (e.g. via stanza.download("sr")) before this cell runs — confirm.
nlp = stanza.Pipeline("sr", processors="tokenize,pos,lemma", use_gpu=False)

In [None]:
# Config
# Root directory of the project. It defaults to the original author's
# checkout; set the PARLIAMENT_NLP_ROOT environment variable to run the
# notebook from another machine without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "PARLIAMENT_NLP_ROOT",
        "/home/vuk/Documents/0 Data Science/parliament-nlp-analysis",
    )
)
# Kept as plain strings so existing consumers see the same type as before.
DATA_PATH = str(PROJECT_ROOT / "data" / "speeches.json")
STOPWORDS_PATH = str(PROJECT_ROOT / "stopwords_serbian.txt")
NUM_TOPICS = 5  # number of topics passed to topic_modeling

## Data Loading

In [None]:
def load_data(path):
    """Read a JSON file and return its contents as a DataFrame.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        A DataFrame built from the parsed JSON. If the file does not exist,
        an error is logged and an empty DataFrame is returned instead.
    """
    try:
        with open(path, encoding="utf-8") as source:
            records = json.load(source)
    except FileNotFoundError:
        logging.error(f"File not found: {path}")
        return pd.DataFrame()

    logging.info(f"Loaded {len(records)} records from {path}")
    return pd.DataFrame(records)

In [None]:
def load_stopwords(path):
    """Load a stopword list from a UTF-8 text file, one word per line.

    Args:
        path: Location of the stopword file.

    Returns:
        A set of the file's non-blank lines, stripped of surrounding
        whitespace and lowercased.
    """
    words = set()
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                words.add(entry.lower())
    return words

In [None]:
# Load
# df: the parsed speech records (an empty DataFrame if the data file is missing).
# stopwords: set of lowercased entries, one per non-blank line of the file.
df = load_data(DATA_PATH)
stopwords = load_stopwords(STOPWORDS_PATH)

## Preprocessing

In [None]:
def tokenize_and_lemmatize(text, stopwords, min_len=3):
    """Lemmatize *text* and return the lowercased lemmas worth keeping.

    The module-level Stanza pipeline ``nlp`` is run over the text. A lemma is
    kept when it is non-empty, longer than ``min_len`` characters, and its
    lowercase form is not in ``stopwords``.

    Args:
        text: Raw text to analyse.
        stopwords: Container of lowercase words to exclude.
        min_len: Lemmas with this many characters or fewer are dropped.
            Defaults to 3, the threshold that used to be hard-coded.

    Returns:
        A list of lowercased lemmas in document order.
    """
    doc = nlp(text)
    kept = []
    for sent in doc.sentences:
        for word in sent.words:
            lemma = word.lemma
            # Skip words without a lemma and lemmas that are too short.
            if not lemma or len(lemma) <= min_len:
                continue
            lowered = lemma.lower()  # lowercase once; reused for filter and output
            if lowered not in stopwords:
                kept.append(lowered)
    return kept

In [None]:
# Basic Cleaning
# Coerce both text columns to str so the .str accessor and the tokenizer
# always receive strings.
# NOTE(review): astype(str) turns missing values into literal strings such as
# "nan" or "None", which would then be processed like real text — confirm the
# data contains no null speeches or speakers.
df['speech'] = df['speech'].astype(str)
df['speaker'] = df['speaker'].astype(str)
df['speech_len'] = df['speech'].str.len()  # speech length in characters

In [None]:
# Tokenize and Clean
# clean_tokens: per-speech list of filtered, lowercased lemmas.
# clean_text: the same lemmas joined with single spaces.
df['clean_tokens'] = df['speech'].apply(tokenize_and_lemmatize, args=(stopwords,))
df['clean_text'] = df['clean_tokens'].map(' '.join)

## Visualizations

In [None]:
def plot_wordcloud(frequencies, title='Most Common Non-Stop Words'):
    """Draw and show a word cloud for a word -> frequency mapping.

    Args:
        frequencies: Mapping from word to its frequency.
        title: Text shown above the figure.
    """
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud.generate_from_frequencies(frequencies)

    _, axes = plt.subplots(figsize=(12, 6))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')  # the image has no meaningful axes
    axes.set_title(title)
    plt.show()

In [None]:
def plot_histogram(data, title, xlabel, ylabel, bins=20, xlim=None):
    """Draw and show a histogram of *data*.

    Args:
        data: Object exposing a pandas-style ``hist`` method (a Series here).
        title: Figure title.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
        bins: Number of histogram bins.
        xlim: Optional ``(low, high)`` pair limiting the visible x-range.
    """
    _, axes = plt.subplots(figsize=(8, 5))
    data.hist(bins=bins, color='skyblue', ax=axes)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    if xlim:
        axes.set_xlim(*xlim)
    axes.grid(True)
    plt.show()

## WordCloud + Length Distribution

In [None]:
# Word Frequency
# Count every lemma across all speeches. A plain loop is used because
# Counter.update returns None: routing it through Series.apply only built
# (and displayed as the cell output) a throwaway Series of None values.
word_freq = Counter()
for tokens in df['clean_tokens']:
    word_freq.update(tokens)

In [None]:
# Plot
# Word cloud of the overall lemma frequencies, then a histogram of speech
# lengths. The histogram's x-axis is limited to 0-30,000 characters, so any
# longer speeches fall outside the visible range.
plot_wordcloud(word_freq)
plot_histogram(df['speech_len'], 'Distribution of Speech Lengths', 'Speech Length (Characters)', 'Number of Speeches', xlim=(0, 30000))

Speech lengths follow a heavy-tailed, power-law-like distribution: most speeches are short and to the point, while a few are very long (the histogram's x-axis is capped at 30,000 characters). Speech length could be used to analyze verbosity by speaker or party.

## Topic Modeling

In [None]:
def topic_modeling(texts, num_topics=5, *, passes=10, random_state=42, num_words=15):
    """Fit an LDA topic model on tokenized documents and return its topics.

    Args:
        texts: Documents, each given as a list of tokens. The collection is
            iterated twice (dictionary, then corpus), so pass a list rather
            than a one-shot generator.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus. Defaults to 10,
            the value that used to be hard-coded.
        random_state: Seed passed to the model. Defaults to 42, the value
            that used to be hard-coded.
        num_words: Number of words included in each topic description.
            Defaults to 15, the value that used to be hard-coded.

    Returns:
        The result of ``LdaModel.print_topics``; the caller in this notebook
        unpacks each entry as a ``(topic_id, description)`` pair.
    """
    dictionary = corpora.Dictionary(texts)
    # Bag-of-words representation of every document.
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    return lda.print_topics(num_words=num_words)

In [None]:
# Topics
# Fit the topic model on the per-speech token lists, then print each returned
# (topic id, topic description) pair.
topics = topic_modeling(df['clean_tokens'].tolist(), NUM_TOPICS)
for i, topic in topics:
    print(f"Topic {i}: {topic}")

| Topic | Label                               |
| ----- | ----------------------------------- |
| 0     | General Discourse / Filler Topic    |
| 1     | Conversational Rhetoric             |
| 2     | Economy and Industry                |
| 3     | National Policy and Legal Framework |
| 4     | Legislative Procedures and Voting   |

## Sentiment Analysis

In [None]:
def simple_sentiment(tokens, pos_words, neg_words):
    """Score tokens as (number of positive hits) minus (number of negative hits).

    The tokens are scanned in a single pass, so one-shot iterators (such as
    generators) are scored correctly; scanning twice would exhaust them
    before the negative words were counted.

    Args:
        tokens: Iterable of tokens to score.
        pos_words: Container of words counted as +1 each.
        neg_words: Container of words counted as -1 each.

    Returns:
        The integer score; 0 for empty input.
    """
    score = 0
    for token in tokens:
        # Two independent checks: a token present in both lexicons nets to zero.
        if token in pos_words:
            score += 1
        if token in neg_words:
            score -= 1
    return score

In [None]:
# Example lexicons
# Tiny hand-picked Cyrillic word lists, for demonstration only.
# NOTE(review): entries are matched against the lemmas in clean_tokens, and
# tokenize_and_lemmatize drops lemmas of three characters or fewer, so the
# three-letter entry 'лаж' can never match. Inflected entries such as
# 'подржавам' presumably will not equal their lemma either — verify.
positive_words = {'добро', 'хвала', 'напредак', 'подржавам'}
negative_words = {'лоше', 'корупција', 'лаж', 'критика'}

# Per-speech score: positive hits minus negative hits.
df['sentiment'] = df['clean_tokens'].apply(lambda tokens: simple_sentiment(tokens, positive_words, negative_words))

plot_histogram(df['sentiment'], 'Sentiment Score Distribution', 'Sentiment Score', 'Number of Speeches')

Most speeches score as neutral to mildly positive in tone, with few outliers. Because the score is based on two small example word lists, a larger and more nuanced Serbian sentiment lexicon would be needed for better accuracy.

## Summary (Optional Export)

In [None]:
# Export final DataFrame with clean text and sentiment
# Writes the selected columns to cleaned_speeches.csv (a relative path, so it
# is resolved against the current working directory) without the index.
df[['speaker', 'speech', 'clean_text', 'speech_len', 'sentiment']].to_csv("cleaned_speeches.csv", index=False)
print("Exported cleaned data.")