In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import numpy as np
import pandas as pd

# Get the active SparkSession, or create one if none exists yet,
# registered under the application name "Python example".
spark = (
    SparkSession.builder
    .appName("Python example")
    .getOrCreate()
)

In [9]:
# Demo rows: a color label, a fruit label and two integer values per row.
fruit_rows = [
    ['red', 'banana', 1, 10],
    ['purple', 'banana', 2, 20],
    ['white', 'carrot', 3, 30],
    ['blue', 'banana', 4, 40],
    ['green', 'grape', 5, 50],
    ['blue', 'carrot', 6, 60],
    ['red', 'banana', 7, 70],
    ['red', 'grape', 8, 80],
    ['white', 'banana', 9, 90],
]
fruit_columns = ['color', 'fruit', 'v1', 'v2']

# Column names come from the list above; column types are inferred from the values.
df = spark.createDataFrame(fruit_rows, schema=fruit_columns)

df.show()

+------+------+---+---+
| color| fruit| v1| v2|
+------+------+---+---+
|   red|banana|  1| 10|
|purple|banana|  2| 20|
| white|carrot|  3| 30|
|  blue|banana|  4| 40|
| green| grape|  5| 50|
|  blue|carrot|  6| 60|
|   red|banana|  7| 70|
|   red| grape|  8| 80|
| white|banana|  9| 90|
+------+------+---+---+



In [10]:
# Average of every numeric column (v1 and v2 here) within each color group.
(
    df
    .groupBy("color")
    .avg()
    .show()
)

+------+-----------------+------------------+
| color|          avg(v1)|           avg(v2)|
+------+-----------------+------------------+
| green|              5.0|              50.0|
|purple|              2.0|              20.0|
| white|              6.0|              60.0|
|   red|5.333333333333333|53.333333333333336|
|  blue|              5.0|              50.0|
+------+-----------------+------------------+



In [3]:
# Fixed seed so that re-running the notebook samples the same numbers.
# A private RandomState is used instead of np.random.seed() so that no
# global random state is touched by this cell.
SEED = 42
rng = np.random.RandomState(SEED)

# 250 integers drawn uniformly from [0, 1000).
data = rng.randint(1000, size=250)

pandas_df = pd.DataFrame(data, columns=['numbers'])

df = spark.createDataFrame(pandas_df)

# reverse() works on strings (and arrays), so the integer column is cast to
# string explicitly rather than relying on implicit coercion. The result is
# text, which is why trailing zeros turn into leading zeros (750 -> "057").
# The alias keeps the column header that the un-cast expression produced.
reversed_numbers = f.reverse(df.numbers.cast('string')).alias('reverse(numbers)')

df.select(df.numbers, reversed_numbers).show()

+-------+----------------+
|numbers|reverse(numbers)|
+-------+----------------+
|    921|             129|
|     89|              98|
|    513|             315|
|    548|             845|
|    511|             115|
|    988|             889|
|    879|             978|
|    115|             511|
|    750|             057|
|    111|             111|
|    110|             011|
|    654|             456|
|     82|              28|
|    635|             536|
|    428|             824|
|    133|             331|
|    338|             833|
|    499|             994|
|    299|             992|
|    514|             415|
+-------+----------------+
only showing top 20 rows



In [17]:
# Animal names in arbitrary order; each becomes one row of a single-column frame.
animal_names = [
    'Vaca', 'Cachorro', 'Gato', 'Touro', 'Galinha',
    'Cobra', 'Girafa', 'Rinoceronte', 'Macaco', 'Ornitorrinco',
    'Leão', 'Tigre', 'Papagaio', 'Tubarão', 'Pavão',
    'Leopardo', 'Avestruz', 'Aranha', 'Abelha', 'Vespa',
]
df = spark.createDataFrame(
    [[animal_name] for animal_name in animal_names],
    schema=['Animal'],
)

# Ascending sort on the only column.
animal_df = df.orderBy('Animal')

# Bring the sorted rows back to the driver and print each Row object.
sorted_rows = animal_df.collect()
for line in sorted_rows:
    print(line)

# Same data again, rendered as Spark's text table.
animal_df.show()

# Reduce to a single partition, then write CSV to the relative path 'animal',
# replacing whatever a previous run left there.
animal_writer = animal_df.coalesce(1).write.mode('overwrite')
animal_writer.csv('animal')

Row(Animal='Abelha')
Row(Animal='Aranha')
Row(Animal='Avestruz')
Row(Animal='Cachorro')
Row(Animal='Cobra')
Row(Animal='Galinha')
Row(Animal='Gato')
Row(Animal='Girafa')
Row(Animal='Leopardo')
Row(Animal='Leão')
Row(Animal='Macaco')
Row(Animal='Ornitorrinco')
Row(Animal='Papagaio')
Row(Animal='Pavão')
Row(Animal='Rinoceronte')
Row(Animal='Tigre')
Row(Animal='Touro')
Row(Animal='Tubarão')
Row(Animal='Vaca')
Row(Animal='Vespa')
+------------+
|      Animal|
+------------+
|      Abelha|
|      Aranha|
|    Avestruz|
|    Cachorro|
|       Cobra|
|     Galinha|
|        Gato|
|      Girafa|
|    Leopardo|
|        Leão|
|      Macaco|
|Ornitorrinco|
|    Papagaio|
|       Pavão|
| Rinoceronte|
|       Tigre|
|       Touro|
|     Tubarão|
|        Vaca|
|       Vespa|
+------------+

