# Suomi24 Corpus: Yearly Evolution of Hate and Friendly Speech

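This notebook loads message data from PostgreSQL, aggregates yearly message counts for hate speech (Query 1), friendly speech (Query 2), and the `both` category, computes the average message length in tokens per year for hate and friendly speech, and visualizes the results.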

In [None]:
# Import required libraries
import os

import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [None]:
# Connect to PostgreSQL using SQLAlchemy and load data into a pandas DataFrame.
# Every connection setting is read from the environment, with a fallback default.
db_user = os.environ.get("POSTGRES_USER", "postgres")
# NOTE(review): the "secret" fallback is kept for backward compatibility only;
# set POSTGRES_PASSWORD explicitly rather than relying on it.
db_password = os.environ.get("POSTGRES_PASSWORD", "secret")
db_host = os.environ.get("POSTGRES_HOST", "localhost")
# Environment values are strings; URL.create expects an integer port.
db_port = int(os.environ.get("POSTGRES_PORT", 5432))
db_name = os.environ.get("POSTGRES_DB", "suomi24")

# Build the URL from its components instead of interpolating into a string:
# URL.create escapes special characters (e.g. "@", ":", "/") in the user name
# and password, which would otherwise produce a malformed connection URL.
# URL.create requires SQLAlchemy 1.4 or newer.
db_url = URL.create(
    drivername="postgresql",
    username=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
    database=db_name,
)

# Create SQLAlchemy engine
engine = create_engine(db_url)

# Only rows with a date are loaded, because the year is derived from it below.
query = """
SELECT date, query_type, content FROM messages
WHERE date IS NOT NULL
"""
df = pd.read_sql(query, engine)

# Extract the calendar year used as the grouping key in the later cells.
df['year'] = pd.to_datetime(df['date']).dt.year

In [None]:
# Count messages per (year, query_type), then pivot the query types into columns.
query_columns = ['hate', 'friendly', 'both']
yearly_counts = df.groupby(['year', 'query_type']).size()
# reindex keeps exactly these three columns in this order and creates any
# that are absent from the data as all-zero columns.
agg = yearly_counts.unstack(fill_value=0).reindex(columns=query_columns, fill_value=0)
agg

In [None]:
# Line chart of the yearly message counts, one line per query type.
fig, ax = plt.subplots(figsize=(12, 6))
agg.plot.line(marker='o', ax=ax)
ax.set_title('Yearly Evolution of Hate, Friendly, and Both Speech in Suomi24')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Messages')
ax.grid(True)
ax.legend(title='Query Type')
# Fix the x-axis to 2001-2017 with one tick per year.
ax.set_xlim(2001, 2017)
ax.set_xticks(range(2001, 2018))
fig.tight_layout()
plt.show()

In [None]:

# Whitespace-delimited token count per message.
# .str.len() is used instead of .apply(len): when content is NULL, .str.split()
# yields NaN and len(NaN) raises TypeError, whereas .str.len() propagates NaN.
# Rows with NaN token counts are then skipped by mean() below.
df['token_count'] = df['content'].str.split().str.len()

# Restrict to the two query types compared in this analysis.
df_filtered = df[df['query_type'].isin(['hate', 'friendly'])]

# Mean token count per (year, query_type), pivoted so each query type is a column;
# year/type combinations with no rows are filled with 0.
avg_tokens = df_filtered.groupby(['year', 'query_type'])['token_count'].mean().unstack(fill_value=0)
avg_tokens

In [None]:
# Line chart of the yearly average token count, one line per query type.
fig, ax = plt.subplots(figsize=(12, 6))
avg_tokens.plot.line(marker='o', ax=ax)
ax.set_title('Yearly Evolution of Average Message Size  (Tokens) in Suomi24')
ax.set_xlabel('Year')
ax.set_ylabel('Average Token Count')
ax.grid(True)
ax.legend(title='Query Type')
# Fix the x-axis to 2001-2017 with one tick per year.
ax.set_xlim(2001, 2017)
ax.set_xticks(range(2001, 2018))
fig.tight_layout()
plt.show()