In [1]:
import gzip
import json
import pandas as pd

from nltk import sent_tokenize, word_tokenize
from pathlib import Path
from tqdm.notebook import tqdm

# Load Data

In [2]:
# Location of the gzip-compressed JSON Lines corpus, relative to this notebook.
DATA_FILE = "../../data/argument-mining-data.jl.gz"

# Parse one JSON object per line. The encoding is pinned to UTF-8 so that the
# decoded text does not depend on the platform's default locale encoding.
with gzip.open(DATA_FILE, "rt", encoding="utf-8") as fh:
    records = [json.loads(line) for line in tqdm(fh)]

# Build the frame in a single step from the collected records; `dataset` is
# only ever bound to a DataFrame.
dataset = pd.DataFrame(records)

# Fail early, with a readable message, if the columns that the statistics
# cells rely on ("text" for tokenisation, "dataset" for grouping) are absent.
missing_columns = {"text", "dataset"} - set(dataset.columns)
assert not missing_columns, f"{DATA_FILE} is missing columns: {sorted(missing_columns)}"

0it [00:00, ?it/s]

# Statistics

In [3]:
def count_sentences(text):
    """Return the number of sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    return len(sent_tokenize(text))


def count_tokens(text):
    """Return the number of tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))


# Per-record length features; the statistics cells read these two columns.
# NOTE(review): both tokenizers are called with NLTK's default language
# setting -- confirm that is appropriate for every sub-corpus.
# NOTE(review): word_tokenize presumably emits punctuation as separate tokens,
# so "no_of_words" is a token count rather than a strict word count -- confirm.
dataset["no_of_sentences"] = dataset["text"].map(count_sentences)
dataset["no_of_words"] = dataset["text"].map(count_tokens)

## Sentence Statistics

### Descriptive Statistics

In [4]:
# Summary statistics of the per-record sentence count over the whole frame.
sentence_counts = dataset["no_of_sentences"]
sentence_counts.describe()

count    20590.000000
mean         1.971491
std          3.784651
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        125.000000
Name: no_of_sentences, dtype: float64

In [5]:
# Same summary, computed separately for each value of the "dataset" column.
sentence_counts_by_source = dataset.groupby("dataset")["no_of_sentences"]
sentence_counts_by_source.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bcause,10086.0,1.714257,1.154006,1.0,1.0,1.0,2.0,22.0
ceps,48.0,9.291667,6.205551,2.0,4.0,8.0,12.25,28.0
rie,426.0,7.253521,7.04784,1.0,2.0,5.0,10.0,42.0
swedish-parliament,477.0,12.125786,15.559351,1.0,3.0,7.0,13.0,90.0
taejae-academy,103.0,15.330097,22.261101,1.0,2.0,7.0,18.5,125.0
touche-23,9450.0,1.312593,0.795705,1.0,1.0,1.0,1.0,10.0


### Total Sentences

In [6]:
# Total number of sentences across all records.
total_sentences = dataset["no_of_sentences"].sum()
total_sentences

40593

In [7]:
# Sentence totals for each value of the "dataset" column.
sentence_totals_by_source = dataset.groupby("dataset")["no_of_sentences"].sum()
sentence_totals_by_source

dataset
bcause                17290
ceps                    446
rie                    3090
swedish-parliament     5784
taejae-academy         1579
touche-23             12404
Name: no_of_sentences, dtype: int64

## Word Statistics

### Descriptive Statistics

In [8]:
# Summary statistics of the per-record token count over the whole frame.
word_counts = dataset["no_of_words"]
word_counts.describe()

count    20590.000000
mean        36.864352
std         80.321764
min          1.000000
25%         15.000000
50%         24.000000
75%         37.000000
max       2539.000000
Name: no_of_words, dtype: float64

In [9]:
# Same summary, computed separately for each value of the "dataset" column.
word_counts_by_source = dataset.groupby("dataset")["no_of_words"]
word_counts_by_source.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bcause,10086.0,33.475114,25.626208,1.0,17.0,30.0,43.0,473.0
ceps,48.0,177.979167,114.989822,33.0,86.75,156.5,250.75,488.0
rie,426.0,124.171362,148.716204,2.0,19.0,65.5,183.5,923.0
swedish-parliament,477.0,234.473795,337.816353,2.0,34.0,127.0,264.0,1868.0
taejae-academy,103.0,335.048544,502.431292,4.0,25.0,168.0,403.5,2539.0
touche-23,9450.0,22.60455,14.883067,4.0,13.0,19.0,27.0,164.0


### Total Words

In [10]:
# Total number of tokens across all records.
total_words = dataset["no_of_words"].sum()
total_words

759037

In [11]:
# Token totals for each value of the "dataset" column.
word_totals_by_source = dataset.groupby("dataset")["no_of_words"].sum()
word_totals_by_source

dataset
bcause                337630
ceps                    8543
rie                    52897
swedish-parliament    111844
taejae-academy         34510
touche-23             213613
Name: no_of_words, dtype: int64