In [None]:
%pip install pysentimiento

In [None]:
import pandas as pd
import os
from pysentimiento import create_analyzer
from google.colab import drive
from collections import defaultdict

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that files stored on
# Drive are reachable through the local filesystem.
drive.mount('/content/drive')

GENERAL ANALYSIS

In [None]:
# Load sentiment and hate-speech analyzers (pysentimiento, lang="es").
# Each analyzer is created once here and reused by the cells below.
sentiment_analyzer = create_analyzer(task="sentiment", lang="es")
hate_analyzer = create_analyzer(task="hate_speech", lang="es")

In [None]:
def analyze_sentiment_text(text_series, analyzer):
    """Predict a sentiment label and its probabilities for every text.

    Parameters
    ----------
    text_series : iterable of str
        Texts to classify. Typically a pandas Series, but any iterable works.
    analyzer : object
        Must expose ``predict(text)`` returning an object with an ``output``
        attribute (the predicted label) and a ``probas`` attribute (the
        per-label probabilities).

    Returns
    -------
    tuple[tuple, tuple]
        ``(labels, scores)`` in input order. Both are empty tuples when
        ``text_series`` contains no elements.
    """
    results = [analyzer.predict(text) for text in text_series]
    # Build the two tuples independently rather than unpacking zip(*...):
    # unpacking an empty zip into two names raises ValueError on empty input.
    labels = tuple(result.output for result in results)
    scores = tuple(result.probas for result in results)
    return labels, scores

# Mean probability of each hate label
def analyze_hate_probabilities(text_series, analyzer):
    """Average the per-label probabilities the analyzer assigns across all texts.

    For each text, ``analyzer.predict(text).probas`` is read as a mapping of
    label -> probability. The probabilities are summed per label and divided
    by the total number of texts.

    Returns a dict of label -> mean probability, or None when ``text_series``
    yields no texts.
    """
    totals = {}
    n_texts = 0  # stays 0 when the loop body never runs

    for n_texts, text in enumerate(text_series, start=1):
        prediction = analyzer.predict(text)
        for label, prob in prediction.probas.items():
            totals[label] = totals.get(label, 0.0) + prob

    if not n_texts:
        return None

    return {label: total / n_texts for label, total in totals.items()}


# Sentiment score
def process_folder_sentiment(party_path, analyzer=None):
    """Compute the average sentiment score over a party's video descriptions.

    Every ``.csv`` file directly inside ``party_path`` is read as a
    tab-separated table. Files that are empty, or that lack a
    ``video_description`` or ``id`` column, are skipped. Each non-empty
    description is classified and its label mapped to a score
    (POS = 1, NEU = 0, NEG = -1).

    Parameters
    ----------
    party_path : str
        Folder holding the party's CSV files.
    analyzer : optional
        Analyzer forwarded to ``analyze_sentiment_text``. Defaults to the
        notebook-level ``sentiment_analyzer``.

    Returns
    -------
    float or None
        Mean score across all files (also printed), or None when no
        description could be scored.
    """
    if analyzer is None:
        analyzer = sentiment_analyzer

    score_map = {'POS': 1, 'NEU': 0, 'NEG': -1}
    sentiment_scores = []

    for file_name in os.listdir(party_path):
        if not file_name.endswith(".csv"):
            continue
        file_path = os.path.join(party_path, file_name)
        try:
            # Skip malformed rows instead of aborting the whole run, the same
            # way process_party_hate_avg reads these files.
            df = pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip')
        except pd.errors.EmptyDataError:
            continue

        if 'video_description' not in df.columns or 'id' not in df.columns:
            continue

        # Work on a standalone Series of strings: avoids assigning columns to
        # a filtered slice of df and guarantees the analyzer receives str.
        texts = df['video_description'].dropna().astype(str)
        texts = texts[texts != '']
        if texts.empty:
            continue

        labels, _ = analyze_sentiment_text(texts, analyzer)
        # Labels missing from score_map become NaN, as with any Series.map.
        file_scores = pd.Series(labels).map(score_map)
        sentiment_scores.extend(file_scores.tolist())

    if len(sentiment_scores) == 0:
        print("No sentiment scores computed for this party.")
        return None

    avg_score = sum(sentiment_scores) / len(sentiment_scores)
    print(f"Average sentiment score for party at {party_path}: {avg_score:.3f}")
    return avg_score


# Function to analyse hate speech
def process_party_hate_avg(party_path, analyzer):
    """Average hate-speech label probabilities over a party's video descriptions.

    Gathers the non-empty ``video_description`` values from every
    tab-separated ``.csv`` file directly inside ``party_path`` and passes them
    to ``analyze_hate_probabilities``. Empty files and files without that
    column are ignored. Returns None when no description was collected.
    """
    collected = []
    csv_names = (name for name in os.listdir(party_path) if name.endswith(".csv"))

    for csv_name in csv_names:
        try:
            frame = pd.read_csv(
                os.path.join(party_path, csv_name),
                delimiter="\t",
                on_bad_lines='skip',
            )
        except pd.errors.EmptyDataError:
            continue

        # NOTE: only video_description is analysed here; the original author's
        # note suggests doing the same for voice_to_text.
        if 'video_description' in frame.columns:
            descriptions = frame['video_description'].dropna().astype(str)
            collected += descriptions[descriptions != ''].tolist()

    if not collected:
        return None

    # Analyse and return the mean probability for each label
    return analyze_hate_probabilities(collected, analyzer)



In [None]:
# Path to the Party's folder, i.e. the directory whose .csv files are analysed
# by the two cells below. 'PARTY_FOLDER' looks like a placeholder — replace it
# with the real directory before running.
party_path = 'PARTY_FOLDER'

In [None]:
# Print and return the average sentiment score for the folder in party_path
process_folder_sentiment(party_path)

In [None]:
# Mean probability of each hate-speech label for the folder in party_path
process_party_hate_avg(party_path, hate_analyzer)

TOTAL NUMBER OF VIDEOS PER PARTY

In [None]:
base_path = 'CSV_PATH' # Path where the files are saved

# Maps each party folder name to the total number of rows (one row per video)
# found across the .csv files in that folder.
video_counts = {}

for party in os.listdir(base_path):
    # Use a loop-local name so the notebook-level `party_path` set in the
    # configuration cell is not silently overwritten by running this cell.
    party_dir = os.path.join(base_path, party)
    if not os.path.isdir(party_dir):
        continue

    total_lines = 0
    for file_name in os.listdir(party_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(party_dir, file_name)
        try:
            # The analysis functions in this notebook read these files as
            # tab-separated. Parsing them with the default comma delimiter
            # plus on_bad_lines='skip' would drop rows whose text contains
            # commas and undercount the videos.
            csv_df = pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip')
            total_lines += len(csv_df)
        except Exception as e:
            # Best effort: report the unreadable file and keep counting.
            print(f"Error reading {file_path}: {e}")
    video_counts[party] = total_lines

video_counts_df = pd.DataFrame(list(video_counts.items()), columns=['Party', 'Total num videos'])
video_counts_df = video_counts_df.sort_values(by='Total num videos', ascending=False).reset_index(drop=True)

print(video_counts_df)