In [1]:
"""Import SpeakLeash dataset into a pandas / polars dataframe."""

import os
import gc

from speakleash import Speakleash
import pandas as pd
import polars

In [2]:
# Select dataset - "thesis" is small dataset but with "plwiki" memory usage is: 4.2GB (pandas) vs 2.3GB (polars)
PROJECT = "plwiki"

# Initiating directory
base_dir = os.path.join('datasets')
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Initiating Speakleash
sl = Speakleash(base_dir)

In [3]:
# Get data
data = sl.get(PROJECT).ext_data

100%|██████████| 870M/870M [01:12<00:00, 12.0MiB/s] 


In [4]:
# Creating pandas DataFrame
pandas_df = pd.DataFrame({"text": s[0], **s[1]} for s in data)

In [5]:
pandas_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1469920 entries, 0 to 1469919
Data columns (total 26 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   text                 1469920 non-null  object 
 1   title                1469920 non-null  object 
 2   url                  1469920 non-null  object 
 3   characters           1469920 non-null  int64  
 4   sentences            1469920 non-null  int64  
 5   words                1469920 non-null  int64  
 6   verbs                1469920 non-null  int64  
 7   nouns                1469920 non-null  int64  
 8   punctuations         1469920 non-null  int64  
 9   symbols              1469920 non-null  int64  
 10  stopwords            1469920 non-null  int64  
 11  oovs                 1469920 non-null  int64  
 12  camel_case           1469920 non-null  int64  
 13  avg_sentence_length  1469920 non-null  float64
 14  adverbs              1469920 non-null  int64  
 15

In [6]:
pandas_df

Unnamed: 0,text,title,url,characters,sentences,words,verbs,nouns,punctuations,symbols,...,avg_word_length,noun_ratio,verb_ratio,adj_ratio,lexical_density,gunning_fog,pos_x,pos_num,capitalized_words,quality
0,"AWK – interpretowany język programowania, któr...",AWK,https://pl.wikipedia.org/wiki?curid=2,10578,112,1510,129,523,379,12,...,5.7404,0.3464,0.0854,0.1411,0.3629,10.17,62,6,59,MEDIUM
1,Alergologia – dziedzina medycyny zajmująca się...,Alergologia,https://pl.wikipedia.org/wiki?curid=4,312,2,39,1,13,9,0,...,6.7949,0.3333,0.0256,0.1795,0.9744,18.13,3,0,1,LOW
2,"ASCII (czyt. ""aski"", skrót od ang. ""American S...",ASCII,https://pl.wikipedia.org/wiki?curid=6,8574,82,1182,83,366,340,2,...,5.9704,0.3096,0.0702,0.1717,0.4425,11.70,58,16,69,MEDIUM
3,Atom – podstawowy składnik materii. Składa się...,Atom,https://pl.wikipedia.org/wiki?curid=7,43393,383,5896,570,2074,1000,14,...,6.1891,0.3518,0.0967,0.1710,0.2799,10.62,89,87,92,MEDIUM
4,"Aksjomat, postulat, pewnik (gr. ""axíōma"", godn...",Aksjomat,https://pl.wikipedia.org/wiki?curid=8,6584,62,878,92,264,155,0,...,6.3235,0.3007,0.1048,0.1686,0.3964,11.15,5,4,4,HIGH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1469915,Pałac w Ostrowiniu – zabytkowy pałac wybudowan...,Pałac w Ostrowiniu,https://pl.wikipedia.org/wiki?curid=5473853,608,6,83,0,32,18,0,...,6.1084,0.3855,0.0000,0.2651,0.7470,11.39,3,1,3,HIGH
1469916,Partizanskaja () – przystanek kolejowy przy os...,Partizanskaja (przystanek kolejowy w obwodzie ...,https://pl.wikipedia.org/wiki?curid=5473859,199,3,25,0,7,11,0,...,6.4000,0.2800,0.0000,0.2000,0.8800,9.72,0,0,0,LOW
1469917,Letni Puchar Kontynentalny w skokach narciarsk...,Letni Puchar Kontynentalny w skokach narciarsk...,https://pl.wikipedia.org/wiki?curid=5473864,438,6,58,4,16,10,0,...,6.3621,0.2759,0.0690,0.2931,0.7414,10.09,2,1,2,HIGH
1469918,Szczyt NATO w w Paryżu 1957 lub 1. Szczyt NATO...,Szczyt NATO w Paryżu 1957,https://pl.wikipedia.org/wiki?curid=5473866,1225,9,154,12,57,21,0,...,6.8247,0.3701,0.0779,0.1948,0.7338,15.74,2,1,7,LOW


In [7]:
del data
del pandas_df
gc.collect()

0

In [8]:
# Get data
data = sl.get(PROJECT).ext_data

In [9]:
# Creating polars DataFrame
polars_df = polars.DataFrame({"text": s[0], **s[1]} for s in data)

In [10]:
print(polars_df.schema)
print(polars_df.shape)
mem_size = polars_df.estimated_size("mb")
print(f"Polars memory usage: {mem_size:.2f} MB = {(mem_size / 1000):.2f} GB")

{'text': Utf8, 'title': Utf8, 'url': Utf8, 'characters': Int64, 'sentences': Int64, 'words': Int64, 'verbs': Int64, 'nouns': Int64, 'punctuations': Int64, 'symbols': Int64, 'stopwords': Int64, 'oovs': Int64, 'camel_case': Int64, 'avg_sentence_length': Float64, 'adverbs': Int64, 'adjectives': Int64, 'avg_word_length': Float64, 'noun_ratio': Float64, 'verb_ratio': Float64, 'adj_ratio': Float64, 'lexical_density': Float64, 'gunning_fog': Float64, 'pos_x': Int64, 'pos_num': Int64, 'capitalized_words': Int64, 'quality': Utf8}
(1469920, 26)
Polars memory usage: 2280.31 MB = 2.28 GB


In [11]:
polars_df

text,title,url,characters,sentences,words,verbs,nouns,punctuations,symbols,stopwords,oovs,camel_case,avg_sentence_length,adverbs,adjectives,avg_word_length,noun_ratio,verb_ratio,adj_ratio,lexical_density,gunning_fog,pos_x,pos_num,capitalized_words,quality
str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,f64,f64,f64,f64,f64,f64,i64,i64,i64,str
"""AWK – interpre…","""AWK""","""https://pl.wik…",10578,112,1510,129,523,379,12,453,79,0,17.75,40,213,5.7404,0.3464,0.0854,0.1411,0.3629,10.17,62,6,59,"""MEDIUM"""
"""Alergologia – …","""Alergologia""","""https://pl.wik…",312,2,39,1,13,9,0,10,2,0,24.0,0,7,6.7949,0.3333,0.0256,0.1795,0.9744,18.13,3,0,1,"""LOW"""
"""ASCII (czyt. ""…","""ASCII""","""https://pl.wik…",8574,82,1182,83,366,340,2,378,60,0,18.8659,23,203,5.9704,0.3096,0.0702,0.1717,0.4425,11.7,58,16,69,"""MEDIUM"""
"""Atom – podstaw…","""Atom""","""https://pl.wik…",43393,383,5896,570,2074,1000,14,1827,217,0,18.3525,182,1008,6.1891,0.3518,0.0967,0.171,0.2799,10.62,89,87,92,"""MEDIUM"""
"""Aksjomat, post…","""Aksjomat""","""https://pl.wik…",6584,62,878,92,264,155,0,323,33,0,17.1129,30,148,6.3235,0.3007,0.1048,0.1686,0.3964,11.15,5,4,4,"""HIGH"""
"""Arytmetyka (ła…","""Arytmetyka""","""https://pl.wik…",4322,32,570,41,192,125,0,167,24,0,22.1562,17,98,6.3561,0.3368,0.0719,0.1719,0.6053,13.85,21,14,11,"""HIGH"""
"""Alkeny – organ…","""Alkeny""","""https://pl.wik…",2359,25,315,25,120,83,0,101,45,0,16.6,12,47,6.2317,0.381,0.0794,0.1492,0.5651,11.97,8,4,4,"""HIGH"""
"""ActiveX – prze…","""ActiveX""","""https://pl.wik…",2287,22,293,19,103,48,1,87,12,0,16.0455,6,40,6.6348,0.3515,0.0648,0.1365,0.5734,12.82,1,0,22,"""HIGH"""
"""Interfejs prog…","""Interfejs prog…","""https://pl.wik…",3131,22,408,23,168,131,9,118,9,0,25.2273,9,51,6.326,0.4118,0.0564,0.125,0.5245,13.6,14,0,38,"""LOW"""
"""AmigaOS – syst…","""AmigaOS""","""https://pl.wik…",3999,39,553,33,201,133,0,114,27,0,17.8718,7,94,6.0127,0.3635,0.0597,0.17,0.5588,10.56,37,3,32,"""HIGH"""


In [12]:
del data
del polars_df
gc.collect()

0