In [1]:
import os
from pathlib import Path
import glob
import xml.etree.ElementTree as ET

import pyarrow
import pyarrow.feather as fth
import pyarrow.parquet as pq

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Relative location of the pan21 author-profiling training-data folder.
data_dir = Path('..') / 'data' / 'pan21-author-profiling-training-2021-03-14'
data_dir

PosixPath('../data/pan21-author-profiling-training-2021-03-14')

In [3]:
def create_df(data_dir, lang, df_columns):
    """Flatten the per-author XML files of one language into a tweet-level table.

    Every ``*.xml`` file under ``<data_dir>/<lang>/`` is parsed. The file name
    without its extension becomes the author id, the root element's ``class``
    attribute becomes the label, and each child element of ``./documents``
    contributes one row holding its text.

    Args:
        data_dir: Directory that contains one sub-directory per language.
        lang: Name of the language sub-directory, e.g. ``'en'``.
        df_columns: Three column names, in the order
            (author id, tweet text, label).

    Returns:
        pandas.DataFrame with columns ``df_columns`` and a fresh 0..n-1 index.
        Rows are ordered by sorted file path, then by document order inside
        each file. When no XML file matches, the frame is empty but still has
        the requested columns.
    """
    rows = []

    for auth_file in sorted(glob.glob(os.path.join(data_dir, lang, '*.xml'))):
        root = ET.parse(auth_file).getroot()
        author_id = os.path.splitext(os.path.basename(auth_file))[0]
        label = root.get('class')
        rows.extend(
            [author_id, tweet.text, label]
            for tweet in root.findall('./documents/*')
        )

    # Build the frame once at the end: DataFrame.append was removed in
    # pandas 2.0, and growing a frame row by row copies it on every step.
    return pd.DataFrame(rows, columns=df_columns)

In [4]:
# Column names used for both language tables: author id, tweet text, label.
df_columns = ['author_id', 'tweet', 'label']
# Build the tweet-level table from the XML files in the 'en' sub-directory.
en_df = create_df(data_dir, 'en', df_columns)
en_df.shape

(40000, 3)

In [5]:
# Same extraction for the 'es' sub-directory, reusing the column names above.
es_df = create_df(data_dir, 'es', df_columns)
es_df.shape

(40000, 3)

In [6]:
# Inspect the last ten rows of the Spanish table.
es_df.tail(10)

Unnamed: 0,author_id,tweet,label
39990,fde1d7437a12068e0e39505af6948f99,RT #USER#: Votar independentismo es esto...,1
39991,fde1d7437a12068e0e39505af6948f99,RT #USER#: Soy el único que ya está hasta las ...,1
39992,fde1d7437a12068e0e39505af6948f99,RT #USER#: Quiero que mis impuestos paguen vac...,1
39993,fde1d7437a12068e0e39505af6948f99,RT #USER#: Dime que eres facha sin decirme que...,1
39994,fde1d7437a12068e0e39505af6948f99,RT #USER#: Dos menestras podemoides yéndose a ...,1
39995,fde1d7437a12068e0e39505af6948f99,"RT #USER#: Herrera, a Iglesias: ""Es una anomal...",1
39996,fde1d7437a12068e0e39505af6948f99,RT #USER#: Dice el hijøputa de Otegi que lo má...,1
39997,fde1d7437a12068e0e39505af6948f99,"RT #USER#: Iñaqui, #USER# lo que opines de #US...",1
39998,fde1d7437a12068e0e39505af6948f99,RT #USER#: Si te gusta alguna de las putas que...,1
39999,fde1d7437a12068e0e39505af6948f99,"RT #USER#: Se ofrece tarotista fiable, serio, ...",1


In [7]:
# Convert each DataFrame to an Arrow table and write it as a Parquet file
# inside data_dir.
en_table = pyarrow.Table.from_pandas(en_df)
pq.write_table(en_table, str(data_dir)+'/en_df.parquet')
es_table = pyarrow.Table.from_pandas(es_df)
pq.write_table(es_table, str(data_dir)+'/es_df.parquet')

In [8]:
# Read the Parquet files back into pandas DataFrames.
en_df_parquet = pq.read_table(str(data_dir)+'/en_df.parquet').to_pandas()
es_df_parquet = pq.read_table(str(data_dir)+'/es_df.parquet').to_pandas()

In [9]:
# Store the same two tables in Feather format as well.
fth.write_feather(df=en_df, dest=str(data_dir)+'/en_df.feather')
fth.write_feather(df=es_df, dest=str(data_dir)+'/es_df.feather')

In [10]:
# Read the Feather files back into pandas DataFrames.
en_df_fth = fth.read_feather(str(data_dir)+'/en_df.feather')
es_df_fth = fth.read_feather(str(data_dir)+'/es_df.feather')

In [11]:
# Export both tables to CSV; index=False leaves the row index out of the file.
en_df.to_csv(str(data_dir)+'/en_df.csv', index=False)
es_df.to_csv(str(data_dir)+'/es_df.csv', index=False)

In [12]:
# Reload the CSV exports.
# NOTE(review): read_csv infers column dtypes, so 'label' presumably comes back
# numeric rather than as text — confirm before comparing it with string labels.
en_df_csv = pd.read_csv(str(data_dir)+'/en_df.csv')
es_df_csv = pd.read_csv(str(data_dir)+'/es_df.csv')

In [13]:
def verify_with_truth(df, data_dir, lang, tweets_per_author=200):
    """Check a tweet-level table against the ``truth.txt`` file of one language.

    Each non-blank line of ``<data_dir>/<lang>/truth.txt`` has the form
    ``<author_id>:::<label>``. For every listed author, ``df`` must contain
    exactly ``tweets_per_author`` rows, and all of them must carry that label.

    Args:
        df: DataFrame with at least the columns ``author_id`` and ``label``.
        data_dir: Directory that contains one sub-directory per language.
        lang: Name of the language sub-directory holding ``truth.txt``.
        tweets_per_author: Expected number of rows per author (default 200).

    Raises:
        AssertionError: If an author has the wrong number of rows or a row
            whose label differs from the truth file.
        ValueError: If a non-blank line does not split into exactly two
            ``:::``-separated fields.
    """
    with open(Path(data_dir, lang, 'truth.txt'), 'r') as f:
        lines = f.readlines()

    for line in lines:
        # Tolerate blank lines, e.g. an extra newline at the end of the file.
        if not line.strip():
            continue

        author, label = (field.strip() for field in line.split(':::'))
        author_df = df.loc[df['author_id'] == author]

        assert author_df.shape[0] == tweets_per_author, (
            f'author {author}: expected {tweets_per_author} rows, '
            f'found {author_df.shape[0]}'
        )
        # Compare as text so that frames whose labels were parsed as numbers
        # (e.g. after a CSV round trip) are checked correctly too.
        assert (author_df['label'].astype(str) == label).all(), (
            f'author {author}: labels do not all equal {label!r}'
        )

In [14]:
# Check the Spanish table reloaded from Parquet against es/truth.txt.
verify_with_truth(es_df_parquet, data_dir, 'es')

In [15]:
def split_authorwise(data_dir, lang, train_ratio=0.8, n_splits=1, random_state=0):
    """Split the authors of one language into stratified train/dev groups.

    Authors and their labels are read from ``<data_dir>/<lang>/truth.txt``,
    where each non-blank line has the form ``<author_id>:::<label>``. The
    split is done on authors, not on tweets, and is stratified by label.

    Args:
        data_dir: Directory that contains one sub-directory per language.
        lang: Name of the language sub-directory holding ``truth.txt``.
        train_ratio: Passed to ``StratifiedShuffleSplit`` as ``train_size``.
        n_splits: Number of independent shuffles to generate.
        random_state: Seed passed to ``StratifiedShuffleSplit``; the default 0
            reproduces the previously hard-coded behaviour.

    Returns:
        List of ``n_splits`` tuples ``(train_authors, dev_authors)``, each
        element being a NumPy array of author ids.
    """
    with open(Path(data_dir, lang, 'truth.txt'), 'r') as f:
        # Split each line once and tolerate blank lines (e.g. a trailing newline).
        records = [line.split(':::') for line in f if line.strip()]

    authors = np.array([fields[0].strip() for fields in records])
    labels = np.array([fields[1].strip() for fields in records])

    sss = StratifiedShuffleSplit(
        n_splits=n_splits, train_size=train_ratio, random_state=random_state)

    return [
        (authors[train_index], authors[dev_index])
        for train_index, dev_index in sss.split(authors, labels)
    ]

In [16]:
def get_train_dev_from_split(df, author_split, tweets_per_author=200):
    """Select the train and dev rows of ``df`` for one author-level split.

    Args:
        df: DataFrame with at least the columns ``author_id`` and ``label``.
        author_split: Pair ``(train_authors, dev_authors)`` of author-id
            collections.
        tweets_per_author: Expected number of rows per author (default 200).

    Returns:
        Tuple ``(train_split, dev_split)`` of DataFrames holding the rows of
        the respective authors.

    Raises:
        AssertionError: If a requested author is missing from ``df``, an author
            does not have exactly ``tweets_per_author`` rows, or the labels in
            a part are not evenly balanced.
    """
    train_authors, dev_authors = author_split[0], author_split[1]
    train_split = df.loc[df.author_id.isin(train_authors)]
    dev_split = df.loc[df.author_id.isin(dev_authors)]

    for part_name, part, authors in (
        ('train', train_split, train_authors),
        ('dev', dev_split, dev_authors),
    ):
        # Without this check the count assertions below pass vacuously when
        # none of the requested authors occurs in df.
        assert set(authors) <= set(part['author_id']), (
            f'{part_name}: some requested authors are missing from df'
        )
        assert (part['author_id'].value_counts() == tweets_per_author).all(), (
            f'{part_name}: every author must have {tweets_per_author} rows'
        )
        # The check assumes two labels, each covering half of the authors.
        rows_per_label = len(authors) * tweets_per_author / 2
        assert (part['label'].value_counts() == rows_per_label).all(), (
            f'{part_name}: every label must cover {rows_per_label:g} rows'
        )

    return train_split, dev_split

In [17]:
def get_single_split(data_dir, lang):
    """Return one author-wise (train, dev) split of a language's Parquet table.

    The table is read from ``<data_dir>/<lang>_df.parquet``. The authors come
    from ``split_authorwise`` called with its default arguments, and only the
    first generated split is used.

    Args:
        data_dir: Directory holding the Parquet file and the language folders.
        lang: Language code used in the file name and for the truth file.

    Returns:
        Tuple ``(train_split, dev_split)`` of DataFrames.
    """
    parquet_path = os.path.join(data_dir, lang+'_df.parquet')
    tweets = pq.read_table(parquet_path).to_pandas()
    first_split = split_authorwise(data_dir, lang)[0]
    train_part, dev_part = get_train_dev_from_split(tweets, first_split)
    return train_part, dev_part

In [18]:
# Author-wise train/dev split of the English table, then a preview of the train rows.
en_train_split, en_dev_split = get_single_split(data_dir, 'en')
en_train_split.head()

Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [19]:
# Author-wise train/dev split of the Spanish table, then a preview of the train rows.
es_train_split, es_dev_split = get_single_split(data_dir, 'es')
es_train_split.head()

Unnamed: 0,author_id,tweet,label
0,0035a3060d075506f5b9b978a910aa1f,#USER# pasta con bichos de agua,0
1,0035a3060d075506f5b9b978a910aa1f,De verdad puto lol de mierda qué asco de juego...,0
2,0035a3060d075506f5b9b978a910aa1f,RT #USER#: me hice una pcr y ya tengo los resu...,0
3,0035a3060d075506f5b9b978a910aa1f,"Y un lomo queso de baguette entera, tranqui #URL#",0
4,0035a3060d075506f5b9b978a910aa1f,Me cambio de curro y me llegan 3 ofertas direc...,0
