In [29]:
# training
# A neural network implemented in PyTorch.
# At least one other Scikit-learn algorithm (for example, K-NN, SVM, Random Forest, Logistic Regression, etc.).
import pandas as pd
import numpy as np
from sklearn import neighbors

In [30]:
# Read the three CSV sources (two evaluation files, then the practice
# development file) in order and stack them row-wise into one frame
# with a fresh 0..n-1 index.
traindata1, traindata2, traindata3 = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        'Datasets/EvaluationData/politicES_phase_2_test_codalab.csv',
        'Datasets/EvaluationData/politicES_phase_2_test_public.csv',
        'Datasets/PractiseData/development.csv',
    )
)
traindata = pd.concat([traindata1, traindata2, traindata3], ignore_index=True)

# ytrain is a full-range slice (every row, every column) of the combined frame.
ytrain = traindata.iloc[:, :]

## Exploratory Analysis of the Dataset

Before applying language processing techniques, an exploratory analysis of the dataset is performed. This analysis includes:
- General description: Number of instances, variables, data types, missing values
- Basic statistics and simple visualizations (class distribution, text lengths, etc.)
- Preliminary text analysis (most frequent words, word cloud, examples by class)
- Formulation of initial hypotheses about possible relationships between disinformation and polarization.

### General description

In [31]:
# Shape of the training data
print("Training data shape:", ytrain.shape)

# Print the headers of the dataset
print("Headers:", ytrain.columns.tolist())

# Print, for each column, the Python type of its first non-null value
# (None -> 'NoneType' when the column has no non-null values).
# The alignment width does not change between iterations, so it is computed
# once here; str() allows non-string column labels to be measured as well.
name_width = max((len(str(c)) for c in ytrain.columns), default=0)
for col in ytrain.columns:
    non_null = ytrain[col].dropna()  # evaluated once per column
    first_non_null = non_null.iloc[0] if not non_null.empty else None
    padding = ' ' * (name_width - len(str(col)))
    print(f"  Column '{col}' {padding} -> type: {type(first_non_null).__name__}")

# Users are in the first column, extract unique users from ytrain
unique_users = ytrain.iloc[:, 0].unique()
print("Unique users (labels) in training data:", len(unique_users))


Training data shape: (101920, 6)
Headers: ['label', 'gender', 'profession', 'ideology_binary', 'ideology_multiclass', 'tweet']
  Column 'label'                -> type: str
  Column 'gender'               -> type: str
  Column 'profession'           -> type: str
  Column 'ideology_binary'      -> type: str
  Column 'ideology_multiclass'  -> type: str
  Column 'tweet'                -> type: str
Unique users (labels) in training data: 907


### Statistics and simple visualizations

In [37]:
# Compute class distribution: number of rows per distinct value of the first column.
class_counts = ytrain.iloc[:, 0].value_counts()
class_counts_unique = class_counts.nunique()

if class_counts.empty:
    # With no rows there is nothing to group; report that explicitly instead
    # of falling through to the grouped report below.
    print("Class distribution in training data: no samples.")
elif class_counts_unique == 1:
    print("All classes have the same number of samples.")
    print(f"Each class has {class_counts.iloc[0]} samples.")
else:
    print("Class distribution in training data:")
    # value_counts() orders its result by count, so classes sharing a count are
    # adjacent; grouping by the count value with sort=False therefore yields the
    # same runs, in the same order, as walking the Series entry by entry.
    # NOTE: the "X to Y" endpoints are the first and last class of each run in
    # value_counts() order, not the bounds of a sorted range of class IDs.
    for count, members in class_counts.groupby(class_counts.to_numpy(), sort=False):
        start_cls = members.index[0]
        prev_cls = members.index[-1]
        if start_cls == prev_cls:
            print(f"  Classes {start_cls}: {count} samples")
        else:
            print(f"  Classes {start_cls} to {prev_cls}: {count} samples")




Class distribution in training data:
  Classes 015f536ad54d267bef6d9a08019f735f to ff5a2270310c6dee99e1caf7de9b7c8c: 160 samples
  Classes 00369358fac3b8d42845f82f0c3ececc to ffcc655c22b6d8b96857db135b61b61a: 40 samples
