# Importo librerías a utilizar #

In [1]:
import pyspark
from pyspark.sql import SparkSession

from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Inicializo una SparkSession #

In [2]:
spark = SparkSession.builder.getOrCreate()

# Creo DataFrames de distintos métodos #

In [3]:
# Build a DataFrame from a list of Row objects; the column names come from the
# Row keyword arguments and no schema argument is passed.
# NOTE(review): the third row here is (a=4, b=5.), whereas the tuple, pandas and
# RDD cells that follow use (3, 4.) - confirm the difference is intended.
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [4]:
# Build a DataFrame from plain tuples; column names and types are declared
# explicitly through the schema string.
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [5]:
# Assemble the same five columns as a pandas DataFrame first...
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
# ...then hand it to Spark; no schema argument is passed here.
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [6]:
# Distribute a list of tuples as an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])
# Convert the RDD to a DataFrame; the schema is a list of column names only,
# so no column types are declared in this call.
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [7]:
df.show()
df.printSchema()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



# Visualización de los datos #

In [8]:
df.show(1)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 1 row



Como alternativa, puede habilitar la configuración `spark.sql.repl.eagerEval.enabled` para que los DataFrames de PySpark se evalúen de forma inmediata (*eager evaluation*) en notebooks como Jupyter. La cantidad de filas que se muestran se controla con la configuración `spark.sql.repl.eagerEval.maxNumRows`.

In [9]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [10]:
df.show(1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
only showing top 1 row



In [11]:
df.columns

['a', 'b', 'c', 'd', 'e']

In [12]:
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [13]:
df.select("a", "b", "c").describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   NULL|
| stddev|1.0|1.0|   NULL|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [14]:
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [15]:
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [16]:
df.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


# Seleccionar y acceder a los datos #

In [17]:
df.a

Column<'a'>

In [18]:
# NOTE(review): Column is imported here but not referenced in the visible
# cells - presumably kept for illustration; confirm before removing.
from pyspark.sql import Column
from pyspark.sql.functions import upper

# Compare the types of a plain column reference, a function applied to it and
# a null-check expression; the chained == is True only if all three match.
type(df.c) == type(upper(df.c)) == type(df.c.isNull())

True

In [19]:
df.select(df.c).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [20]:
df.withColumn('upper_c', upper(df.c)).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3|
+---+---+-------+----------+-------------------+-------+



In [21]:
df.filter(df.a == 1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



# Aplicando funciones #

PySpark admite varias UDF y API que permiten a los usuarios ejecutar funciones nativas de Python. Por ejemplo, la siguiente celda usa directamente la API de `pandas.Series` dentro de una función nativa de Python.

In [22]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

In [23]:
@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Return the incoming pandas Series with 1 added to every element."""
    incremented = series + 1
    return incremented

df.select(pandas_plus_one(df.a)).show()

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 4|
+------------------+



In [24]:
def pandas_filter_func(iterator):
    """Lazily yield, for each pandas DataFrame in ``iterator``, only the rows whose ``a`` equals 1."""
    yield from (batch[batch.a == 1] for batch in iterator)

df.mapInPandas(pandas_filter_func, schema=df.schema).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



# Agrupando datos #

In [25]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [26]:
df.groupby('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [27]:
def plus_mean(pandas_df):
    """Return a copy of ``pandas_df`` whose ``v1`` column is centred on its own mean.

    Despite the function name, the mean of ``v1`` is subtracted from each
    value, not added. The input frame is not modified.
    """
    centred_v1 = pandas_df.v1 - pandas_df.v1.mean()
    return pandas_df.assign(v1=centred_v1)

# Apply plus_mean to each colour group as a pandas DataFrame.
# NOTE(review): schema=df.schema reuses the input column types, yet plus_mean
# subtracts a mean from v1. The table below shows whole numbers even though the
# red group's mean is 4.8, so fractional parts appear to be dropped - confirm
# whether v1 should be declared as double in the output schema.
df.groupby('color').applyInPandas(plus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



In [28]:
df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ('time', 'id', 'v1'))

df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2'))

def asof_join(l, r):
    """As-of join two pandas DataFrames on ``time`` within matching ``id``.

    ``pd.merge_asof`` raises ``ValueError`` unless both inputs are sorted by
    the ``on`` key. Nothing at the call site orders the rows of each cogroup,
    so both sides are sorted by ``time`` here before merging. Inputs that are
    already sorted produce the same result as before.

    Args:
        l: Left frame; must contain ``time`` and ``id`` columns.
        r: Right frame; must contain ``time`` and ``id`` columns.

    Returns:
        A pandas DataFrame with the rows of ``l`` (ordered by ``time``) and the
        columns of ``r`` attached from the latest ``r`` row whose ``time`` is
        not after the left row's ``time`` and whose ``id`` matches.
    """
    # A stable sort keeps the incoming relative order of rows with equal time.
    left_sorted = l.sort_values('time', kind='mergesort')
    right_sorted = r.sort_values('time', kind='mergesort')
    return pd.merge_asof(left_sorted, right_sorted, on='time', by='id')

df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
    asof_join, schema='time int, id int, v1 double, v2 string').show()

+--------+---+---+---+
|    time| id| v1| v2|
+--------+---+---+---+
|20000101|  1|1.0|  x|
|20000102|  1|3.0|  x|
|20000101|  2|2.0|  y|
|20000102|  2|4.0|  y|
+--------+---+---+---+



# Entrada y salida de datos #

## CSV

In [80]:
df.write.csv('foo.csv', header=True, mode="overwrite")
spark.read.csv('foo.csv', header=True).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|banana|  1| 10|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



## PARQUET

In [82]:
df.write.parquet('bar.parquet', mode="overwrite")
spark.read.parquet('bar.parquet').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|  red|banana|  7| 70|
|  red|banana|  1| 10|
|  red|carrot|  3| 30|
|  red| grape|  8| 80|
+-----+------+---+---+



## ORC

In [84]:
df.write.orc('zoo.orc', mode="overwrite")
spark.read.orc('zoo.orc').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|banana|  1| 10|
|  red|carrot|  5| 50|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



# Trabajando con SQL #

In [32]:
df.createOrReplaceTempView("tableA")
spark.sql("SELECT count(*) from tableA").show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



In [33]:
@pandas_udf("integer")
def add_one(s: pd.Series) -> pd.Series:
    """Return ``s`` with 1 added to every element."""
    plus_one = s + 1
    return plus_one

spark.udf.register("add_one", add_one)
spark.sql("SELECT add_one(v1) FROM tableA").show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+



In [34]:
from pyspark.sql.functions import expr

In [35]:
df.selectExpr('add_one(v1)').show()
df.select(expr('count(*)') > 0).show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+

+--------------+
|(count(1) > 0)|
+--------------+
|          true|
+--------------+



# Inicio rápido: API de Pandas en Spark #

In [36]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession



* Creación de objetos:
Crear una serie de pandas-on-Spark pasando una lista de valores, permitiendo que la API de pandas en Spark cree un índice entero predeterminado:

In [37]:
s = ps.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [38]:
psdf = ps.DataFrame(
    {'a': [1, 2, 3, 4, 5, 6],
     'b': [100, 200, 300, 400, 500, 600],
     'c': ["one", "two", "three", "four", "five", "six"]},
    index=[10, 20, 30, 40, 50, 60])
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [39]:
# Six consecutive daily timestamps starting 2013-01-01, used as the row index.
dates = pd.date_range('20130101', periods=6)
# 6x4 frame of standard-normal draws; no seed is set, so the values differ on
# every run. (A bare `dates` expression used to sit between these statements;
# only the last expression of a cell is displayed, so it had no effect.)
pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
pdf

Unnamed: 0,A,B,C,D
2013-01-01,-0.367045,-0.981291,-0.614921,-0.355913
2013-01-02,0.932696,0.597765,-1.118993,-1.572401
2013-01-03,0.502658,1.212432,-1.320696,-1.080589
2013-01-04,1.715236,-0.442291,1.368208,0.003849
2013-01-05,0.214364,-0.692755,0.109878,-0.219534
2013-01-06,0.199766,-0.252793,-0.478801,-1.650257


In [41]:
psdf = ps.from_pandas(pdf)
type(psdf)

pyspark.pandas.frame.DataFrame

In [42]:
psdf

Unnamed: 0,A,B,C,D
2013-01-01,-0.367045,-0.981291,-0.614921,-0.355913
2013-01-02,0.932696,0.597765,-1.118993,-1.572401
2013-01-03,0.502658,1.212432,-1.320696,-1.080589
2013-01-04,1.715236,-0.442291,1.368208,0.003849
2013-01-05,0.214364,-0.692755,0.109878,-0.219534
2013-01-06,0.199766,-0.252793,-0.478801,-1.650257


In [43]:
# NOTE(review): `spark` was already assigned from the same builder call near the
# top of the notebook; this presumably returns that same session - confirm the
# reassignment is needed.
spark = SparkSession.builder.getOrCreate()
# Convert the pandas DataFrame `pdf` into a Spark DataFrame and print it.
sdf = spark.createDataFrame(pdf)
sdf.show()

+-------------------+--------------------+-------------------+--------------------+
|                  A|                   B|                  C|                   D|
+-------------------+--------------------+-------------------+--------------------+
|-0.3670450674172628| -0.9812908835215707| -0.614921046812434| -0.3559126604219527|
| 0.9326960462136177|  0.5977648837464683|-1.1189928439823946| -1.5724007302316794|
| 0.5026584753394927|   1.212432468720428|-1.3206957611282975| -1.0805892263550108|
| 1.7152364140076737|-0.44229109378871406| 1.3682082775139146|0.003848997723393...|
|0.21436408706448984| -0.6927545467684585|0.10987753374800197| -0.2195343036626632|
|0.19976564696501145|-0.25279272444539885|-0.4788014831156336| -1.6502566791648736|
+-------------------+--------------------+-------------------+--------------------+



In [44]:
psdf = sdf.pandas_api()
psdf

Unnamed: 0,A,B,C,D
0,-0.367045,-0.981291,-0.614921,-0.355913
1,0.932696,0.597765,-1.118993,-1.572401
2,0.502658,1.212432,-1.320696,-1.080589
3,1.715236,-0.442291,1.368208,0.003849
4,0.214364,-0.692755,0.109878,-0.219534
5,0.199766,-0.252793,-0.478801,-1.650257


In [45]:
psdf.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [46]:
psdf.head()

Unnamed: 0,A,B,C,D
0,-0.367045,-0.981291,-0.614921,-0.355913
1,0.932696,0.597765,-1.118993,-1.572401
2,0.502658,1.212432,-1.320696,-1.080589
3,1.715236,-0.442291,1.368208,0.003849
4,0.214364,-0.692755,0.109878,-0.219534


In [48]:
psdf.index

Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [49]:
psdf.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [50]:
psdf.to_numpy()



array([[-0.36704507, -0.98129088, -0.61492105, -0.35591266],
       [ 0.93269605,  0.59776488, -1.11899284, -1.57240073],
       [ 0.50265848,  1.21243247, -1.32069576, -1.08058923],
       [ 1.71523641, -0.44229109,  1.36820828,  0.003849  ],
       [ 0.21436409, -0.69275455,  0.10987753, -0.2195343 ],
       [ 0.19976565, -0.25279272, -0.47880148, -1.65025668]])

In [51]:
psdf.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.532946,-0.093155,-0.342554,-0.812474
std,0.718473,0.83398,0.978138,0.718006
min,-0.367045,-0.981291,-1.320696,-1.650257
25%,0.199766,-0.692755,-1.118993,-1.572401
50%,0.214364,-0.442291,-0.614921,-1.080589
75%,0.932696,0.597765,0.109878,-0.219534
max,1.715236,1.212432,1.368208,0.003849


In [52]:
psdf.T

Unnamed: 0,0,1,2,3,4,5
A,-0.367045,0.932696,0.502658,1.715236,0.214364,0.199766
B,-0.981291,0.597765,1.212432,-0.442291,-0.692755,-0.252793
C,-0.614921,-1.118993,-1.320696,1.368208,0.109878,-0.478801
D,-0.355913,-1.572401,-1.080589,0.003849,-0.219534,-1.650257


In [53]:
psdf.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
5,0.199766,-0.252793,-0.478801,-1.650257
4,0.214364,-0.692755,0.109878,-0.219534
3,1.715236,-0.442291,1.368208,0.003849
2,0.502658,1.212432,-1.320696,-1.080589
1,0.932696,0.597765,-1.118993,-1.572401
0,-0.367045,-0.981291,-0.614921,-0.355913


In [54]:
psdf.sort_values(by='B')

Unnamed: 0,A,B,C,D
0,-0.367045,-0.981291,-0.614921,-0.355913
4,0.214364,-0.692755,0.109878,-0.219534
3,1.715236,-0.442291,1.368208,0.003849
5,0.199766,-0.252793,-0.478801,-1.650257
1,0.932696,0.597765,-1.118993,-1.572401
2,0.502658,1.212432,-1.320696,-1.080589


# Datos faltantes #

In [55]:
# Keep only the first four dates and add a new column 'E'; it has no values
# yet, so it shows up empty in the output wherever nothing is assigned.
pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns) + ['E'])
# Fill 'E' with 1 for the first two dates (the output shows both rows set).
pdf1.loc[dates[0]:dates[1], 'E'] = 1
# Convert the pandas frame to a pandas-on-Spark DataFrame and display it.
psdf1 = ps.from_pandas(pdf1)
psdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.367045,-0.981291,-0.614921,-0.355913,1.0
2013-01-02,0.932696,0.597765,-1.118993,-1.572401,1.0
2013-01-03,0.502658,1.212432,-1.320696,-1.080589,
2013-01-04,1.715236,-0.442291,1.368208,0.003849,


In [56]:
psdf1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.367045,-0.981291,-0.614921,-0.355913,1.0
2013-01-02,0.932696,0.597765,-1.118993,-1.572401,1.0


In [57]:
psdf1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.367045,-0.981291,-0.614921,-0.355913,1.0
2013-01-02,0.932696,0.597765,-1.118993,-1.572401,1.0
2013-01-03,0.502658,1.212432,-1.320696,-1.080589,5.0
2013-01-04,1.715236,-0.442291,1.368208,0.003849,5.0


# Operaciones

## Estadística

In [58]:
psdf.mean()

A    0.532946
B   -0.093155
C   -0.342554
D   -0.812474
dtype: float64

## Configuraciones de Spark

In [59]:
prev = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")  # Remember the current value so it can be restored afterwards.
ps.set_option("compute.default_index_type", "distributed")  # Switch to the "distributed" default index (per the original note: to avoid index overhead).
import warnings
warnings.filterwarnings("ignore")  # Silences ALL warnings from here on, not only those coming from Arrow optimizations.

In [60]:
# Time the pandas-on-Spark -> pandas conversion with the Arrow setting turned on.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
%timeit ps.range(300000).to_pandas()

228 ms ± 26.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [61]:
# Time the same conversion with the Arrow setting turned off, for comparison.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", False)
%timeit ps.range(300000).to_pandas()

1.19 s ± 127 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
ps.reset_option("compute.default_index_type")  # Undo the index-type override made before the timing cells.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", prev)  # Restore the Arrow setting saved in `prev`.

# Agrupaciones

In [64]:
psdf = ps.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
psdf

Unnamed: 0,A,B,C,D
0,foo,one,-0.47953,1.214565
1,bar,one,-0.248678,0.852322
2,foo,two,0.064281,1.790076
3,bar,three,0.895084,2.698925
4,foo,two,-0.760154,0.824861
5,bar,two,-0.186017,-1.178444
6,foo,one,0.538806,1.400224
7,foo,three,-0.706009,0.240852


In [65]:
psdf.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,-1.342606,5.470578
bar,0.460389,2.372802


In [66]:
psdf.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,0.059276,2.614789
bar,one,-0.248678,0.852322
foo,two,-0.695872,2.614937
bar,three,0.895084,2.698925
bar,two,-0.186017,-1.178444
foo,three,-0.706009,0.240852


# Graficación

In [73]:
pip install plotly==5.23.0

Collecting plotly==5.23.0
  Downloading plotly-5.23.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tenacity>=6.2.0 (from plotly==5.23.0)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Downloading plotly-5.23.0-py3-none-any.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tenacity-9.0.0-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.23.0 tenacity-9.0.0
Note: you may need to restart the kernel to use updated packages.


In [75]:
pser = pd.Series(np.random.randn(1000),
                 index=pd.date_range('1/1/2000', periods=1000))

In [76]:
psser = ps.Series(pser)

In [77]:
psser = psser.cummax()

In [78]:
psser.plot()

In [79]:
pdf = pd.DataFrame(np.random.randn(1000, 4), index=pser.index,
                   columns=['A', 'B', 'C', 'D'])
psdf = ps.from_pandas(pdf)
psdf = psdf.cummax()
psdf.plot()

# Entrada y salida de datos

## CSV

In [85]:
psdf.to_csv('foo.csv')
ps.read_csv('foo.csv').head(10)

Unnamed: 0,A,B,C,D
0,0.114543,0.425051,1.499162,0.668994
1,0.114543,0.425051,1.499162,1.421866
2,0.114543,0.425051,1.499162,1.421866
3,0.752672,0.425051,1.499162,1.421866
4,0.752672,0.425051,1.499162,1.619868
5,0.752672,1.031866,1.499162,1.619868
6,1.252836,2.18715,1.499162,1.619868
7,1.252836,2.18715,1.499162,1.619868
8,1.252836,2.18715,1.85704,1.619868
9,1.252836,2.18715,1.85704,1.619868


## Parquet

In [86]:
psdf.to_parquet('bar.parquet')
ps.read_parquet('bar.parquet').head(10)

Unnamed: 0,A,B,C,D
0,0.114543,0.425051,1.499162,0.668994
1,0.114543,0.425051,1.499162,1.421866
2,0.114543,0.425051,1.499162,1.421866
3,0.752672,0.425051,1.499162,1.421866
4,0.752672,0.425051,1.499162,1.619868
5,0.752672,1.031866,1.499162,1.619868
6,1.252836,2.18715,1.499162,1.619868
7,1.252836,2.18715,1.499162,1.619868
8,1.252836,2.18715,1.85704,1.619868
9,1.252836,2.18715,1.85704,1.619868


## Spark IO

In [87]:
psdf.to_spark_io('zoo.orc', format="orc")
ps.read_spark_io('zoo.orc', format="orc").head(10)

Unnamed: 0,A,B,C,D
0,0.114543,0.425051,1.499162,0.668994
1,0.114543,0.425051,1.499162,1.421866
2,0.114543,0.425051,1.499162,1.421866
3,0.752672,0.425051,1.499162,1.421866
4,0.752672,0.425051,1.499162,1.619868
5,0.752672,1.031866,1.499162,1.619868
6,1.252836,2.18715,1.499162,1.619868
7,1.252836,2.18715,1.499162,1.619868
8,1.252836,2.18715,1.85704,1.619868
9,1.252836,2.18715,1.85704,1.619868


Fuente: https://spark.apache.org/docs/3.3.1/api/python/getting_started/install.html