In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
csv = spark.read.csv("./data/people_data.csv", header=True, inferSchema=True) #lee el archivo y lo transforma a un dataframe

In [4]:
type(csv)

pyspark.sql.dataframe.DataFrame

In [5]:
csv.show(10)

+---------+---+------+------------------+-----------------+
|     Name|Age|Gender|            Height|           Weight|
+---------+---+------+------------------+-----------------+
|  Eduardo| 34|Female|154.77356283348317|65.40959594878004|
|Alejandro| 72|  Male|175.09077870318868|67.71691582543606|
|  Gustavo| 73|  Male|184.72151535794515|69.43436307585084|
|   Marcos| 66|  Male| 167.2833431342786|67.40999627643211|
|    Jorge| 53|Female|180.71948252374528|88.49667898234189|
|  Luciana| 20|Female|168.64632227790082|78.91118118197696|
| Gabriela| 35|  Male| 174.3897278047251|67.40077470974775|
|   Daniel| 79|  Male|174.40544987517276|99.25533855454273|
|  Gonzalo| 35|  Male|156.97306377798276|51.65642033937228|
|  Martina| 24|Female| 190.8518248828354|61.34849834025222|
+---------+---+------+------------------+-----------------+
only showing top 10 rows



In [6]:
csv.select("Name", "Age").show()

+---------+---+
|     Name|Age|
+---------+---+
|  Eduardo| 34|
|Alejandro| 72|
|  Gustavo| 73|
|   Marcos| 66|
|    Jorge| 53|
|  Luciana| 20|
| Gabriela| 35|
|   Daniel| 79|
|  Gonzalo| 35|
|  Martina| 24|
|  Agustín| 44|
|  Silvana| 22|
|  Martina| 29|
|  Claudia| 24|
|   Carlos| 23|
|    Carla| 32|
|   Javier| 50|
|    Sofía| 61|
|    María| 45|
|  Ricardo| 20|
+---------+---+
only showing top 20 rows



In [7]:
csv.select("Age").filter(F.col("Age")>30).show()

+---+
|Age|
+---+
| 34|
| 72|
| 73|
| 66|
| 53|
| 35|
| 79|
| 35|
| 44|
| 32|
| 50|
| 61|
| 45|
| 33|
| 69|
| 48|
| 44|
| 58|
| 40|
| 49|
+---+
only showing top 20 rows



In [8]:
csv.orderBy(F.desc("Age")).collect()

[Row(Name='Daniel', Age=79, Gender='Male', Height=174.40544987517276, Weight=99.25533855454273),
 Row(Name='Natalia', Age=79, Gender='Female', Height=164.42298938366673, Weight=51.27256960487532),
 Row(Name='Mateo', Age=79, Gender='Female', Height=181.07434395680204, Weight=84.56437570925465),
 Row(Name='Miguel', Age=79, Gender='Male', Height=197.60280526702556, Weight=93.48503732496546),
 Row(Name='Vanesa', Age=79, Gender='Male', Height=188.5913640236602, Weight=69.19965248620028),
 Row(Name='Susana', Age=79, Gender='Female', Height=165.4903320870794, Weight=51.45878870104935),
 Row(Name='Martina', Age=79, Gender='Female', Height=195.2732336507824, Weight=99.77514021598395),
 Row(Name='María', Age=79, Gender='Male', Height=172.85465820266347, Weight=61.26776614142906),
 Row(Name='Juan', Age=79, Gender='Male', Height=198.89207561788353, Weight=82.72866381121341),
 Row(Name='Vanesa', Age=79, Gender='Male', Height=182.7013235009142, Weight=51.73753892263363),
 Row(Name='Javier', Age=79, 

In [9]:
csv.groupBy("Gender").avg("Height").show()

+------+------------------+
|Gender|       avg(Height)|
+------+------------------+
|Female|174.96148501826968|
|  Male|175.03511208362153|
+------+------------------+



In [10]:
csv.groupBy("Gender").agg({"Height":"avg"}).show()

+------+------------------+
|Gender|       avg(Height)|
+------+------------------+
|Female|174.96148501826968|
|  Male|175.03511208362153|
+------+------------------+



In [11]:
csv.withColumn("Height(pulgadas)", csv.Height/2.54).show()

+---------+---+------+------------------+-----------------+------------------+
|     Name|Age|Gender|            Height|           Weight|  Height(pulgadas)|
+---------+---+------+------------------+-----------------+------------------+
|  Eduardo| 34|Female|154.77356283348317|65.40959594878004|60.934473556489436|
|Alejandro| 72|  Male|175.09077870318868|67.71691582543606| 68.93337744220027|
|  Gustavo| 73|  Male|184.72151535794515|69.43436307585084|  72.7250060464351|
|   Marcos| 66|  Male| 167.2833431342786|67.40999627643211|  65.8595839111333|
|    Jorge| 53|Female|180.71948252374528|88.49667898234189| 71.14940256840366|
|  Luciana| 20|Female|168.64632227790082|78.91118118197696|  66.3961898731893|
| Gabriela| 35|  Male| 174.3897278047251|67.40077470974775| 68.65737315146657|
|   Daniel| 79|  Male|174.40544987517276|99.25533855454273|  68.6635629429814|
|  Gonzalo| 35|  Male|156.97306377798276|51.65642033937228| 61.80041881022943|
|  Martina| 24|Female| 190.8518248828354|61.34849834

In [12]:
csv.drop("height(pulgadas)").show()

+---------+---+------+------------------+-----------------+
|     Name|Age|Gender|            Height|           Weight|
+---------+---+------+------------------+-----------------+
|  Eduardo| 34|Female|154.77356283348317|65.40959594878004|
|Alejandro| 72|  Male|175.09077870318868|67.71691582543606|
|  Gustavo| 73|  Male|184.72151535794515|69.43436307585084|
|   Marcos| 66|  Male| 167.2833431342786|67.40999627643211|
|    Jorge| 53|Female|180.71948252374528|88.49667898234189|
|  Luciana| 20|Female|168.64632227790082|78.91118118197696|
| Gabriela| 35|  Male| 174.3897278047251|67.40077470974775|
|   Daniel| 79|  Male|174.40544987517276|99.25533855454273|
|  Gonzalo| 35|  Male|156.97306377798276|51.65642033937228|
|  Martina| 24|Female| 190.8518248828354|61.34849834025222|
|  Agustín| 44|  Male|174.16319667544008|50.84032522188359|
|  Silvana| 22|  Male|174.98421257451335|73.80294629313435|
|  Martina| 29|  Male|  154.596571612654|73.32912451569628|
|  Claudia| 24|Female|165.74437374643756

In [13]:
csv.select(F.col("Name").alias("FullName")).show()

+---------+
| FullName|
+---------+
|  Eduardo|
|Alejandro|
|  Gustavo|
|   Marcos|
|    Jorge|
|  Luciana|
| Gabriela|
|   Daniel|
|  Gonzalo|
|  Martina|
|  Agustín|
|  Silvana|
|  Martina|
|  Claudia|
|   Carlos|
|    Carla|
|   Javier|
|    Sofía|
|    María|
|  Ricardo|
+---------+
only showing top 20 rows



In [14]:
csv.dropna().show()

+---------+---+------+------------------+-----------------+
|     Name|Age|Gender|            Height|           Weight|
+---------+---+------+------------------+-----------------+
|  Eduardo| 34|Female|154.77356283348317|65.40959594878004|
|Alejandro| 72|  Male|175.09077870318868|67.71691582543606|
|  Gustavo| 73|  Male|184.72151535794515|69.43436307585084|
|   Marcos| 66|  Male| 167.2833431342786|67.40999627643211|
|    Jorge| 53|Female|180.71948252374528|88.49667898234189|
|  Luciana| 20|Female|168.64632227790082|78.91118118197696|
| Gabriela| 35|  Male| 174.3897278047251|67.40077470974775|
|   Daniel| 79|  Male|174.40544987517276|99.25533855454273|
|  Gonzalo| 35|  Male|156.97306377798276|51.65642033937228|
|  Martina| 24|Female| 190.8518248828354|61.34849834025222|
|  Agustín| 44|  Male|174.16319667544008|50.84032522188359|
|  Silvana| 22|  Male|174.98421257451335|73.80294629313435|
|  Martina| 29|  Male|  154.596571612654|73.32912451569628|
|  Claudia| 24|Female|165.74437374643756

In [15]:
mean_age = csv.select(F.mean(F.col("Age"))).collect()[0][0]

In [16]:
csv.fillna({'Age': mean_age})

DataFrame[Name: string, Age: int, Gender: string, Height: double, Weight: double]

In [17]:
dataframe2 = spark.createDataFrame([("Andrea", "Argentina"),
                                    ("Adriel", "Brasil"),
                                    ("Josefina", "Uruguay"),
                                    ("Maria", "Chile"),
                                    ("Felix","Paraguay"),
                                   ("Camila","Paraguay"),
                                   ("Camila","Uruguay"),
                                   ("Carlos","Paraguay"),
                                   ("Laura","Colombia"),
                                   ("Maria","Colombia"),
                                   ("Diego","Argentina"),
                                   ("Martina","Chile"),
                                   ("Silvana","Venezuela"),
                                   ("Eduardo","Brasil")], schema='Name string, Country string')

In [18]:
type(dataframe2)

pyspark.sql.dataframe.DataFrame

In [19]:
df_test = dataframe2
for i in range (1000):
    df_test = df_test.union(dataframe2)
df_test.count()

14014

In [21]:
#realizo el join interno:
join_interno = csv.join(df_test, csv["Name"]==df_test["Name"], "inner")

In [22]:
join_interno.show()

+------+---+------+------------------+------------------+------+---------+
|  Name|Age|Gender|            Height|            Weight|  Name|  Country|
+------+---+------+------------------+------------------+------+---------+
|Andrea| 26|  Male| 162.4351048913221| 87.03096179645289|Andrea|Argentina|
|Andrea| 63|Female|187.59629768986446|59.341787300517694|Andrea|Argentina|
|Andrea| 55|  Male| 189.5441543908251| 60.91553343686399|Andrea|Argentina|
|Andrea| 59|  Male| 182.2754657792239|  87.3786156206954|Andrea|Argentina|
|Andrea| 57|Female|161.56579696949044| 63.08396042789558|Andrea|Argentina|
|Andrea| 50|Female|162.03038658184408| 96.92164076723981|Andrea|Argentina|
|Andrea| 78|  Male|191.20910417047608| 55.63463588969129|Andrea|Argentina|
|Andrea| 53|  Male| 197.1290675526551|  54.9112959330009|Andrea|Argentina|
|Andrea| 63|Female|170.90264698414612|54.803845242386984|Andrea|Argentina|
|Andrea| 51|Female| 181.3604222531811| 97.68709232776806|Andrea|Argentina|
|Andrea| 50|  Male| 183.1

In [32]:
df = spark.createDataFrame([
    (1, 'string1'),
    (2, 'string2'),
    (3, 'string3')], schema='a long, name string')
df.show()

+---+-------+
|  a|   name|
+---+-------+
|  1|string1|
|  2|string2|
|  3|string3|
+---+-------+



In [39]:
def upper_function(df, columna):
    return df.withColumn("upper_" + columna, F.upper(F.col(columna)))

muestra= upper_function(df, 'name')
muestra.show()

+---+-------+----------+
|  a|   name|upper_name|
+---+-------+----------+
|  1|string1|   STRING1|
|  2|string2|   STRING2|
|  3|string3|   STRING3|
+---+-------+----------+



In [40]:
def plusone_function(df, columna):
    return df.withColumn("plusone_" + columna, F.col(columna)+1)

muestra= plusone_function(csv, 'Age')
muestra.show()

+---------+---+------+------------------+-----------------+-----------+
|     Name|Age|Gender|            Height|           Weight|plusone_Age|
+---------+---+------+------------------+-----------------+-----------+
|  Eduardo| 34|Female|154.77356283348317|65.40959594878004|         35|
|Alejandro| 72|  Male|175.09077870318868|67.71691582543606|         73|
|  Gustavo| 73|  Male|184.72151535794515|69.43436307585084|         74|
|   Marcos| 66|  Male| 167.2833431342786|67.40999627643211|         67|
|    Jorge| 53|Female|180.71948252374528|88.49667898234189|         54|
|  Luciana| 20|Female|168.64632227790082|78.91118118197696|         21|
| Gabriela| 35|  Male| 174.3897278047251|67.40077470974775|         36|
|   Daniel| 79|  Male|174.40544987517276|99.25533855454273|         80|
|  Gonzalo| 35|  Male|156.97306377798276|51.65642033937228|         36|
|  Martina| 24|Female| 190.8518248828354|61.34849834025222|         25|
|  Agustín| 44|  Male|174.16319667544008|50.84032522188359|     

In [42]:
csv.write.csv('./data/new_csv.csv', header=True, mode="overwrite")

In [43]:
parquet=spark.read.parquet('bar.parquet')

In [44]:
parquet.show()

+-------------------+-------------------+------------------+------------------+
|                  A|                  B|                 C|                 D|
+-------------------+-------------------+------------------+------------------+
|0.11454265038906333|0.42505102840744696|1.4991616638603555| 0.668994073159884|
|0.11454265038906333|0.42505102840744696|1.4991616638603555| 1.421866071862577|
|0.11454265038906333|0.42505102840744696|1.4991616638603555| 1.421866071862577|
| 0.7526719975128419|0.42505102840744696|1.4991616638603555| 1.421866071862577|
| 0.7526719975128419|0.42505102840744696|1.4991616638603555|1.6198675934248319|
| 0.7526719975128419| 1.0318658164575025|1.4991616638603555|1.6198675934248319|
|  1.252835588582649| 2.1871504745259434|1.4991616638603555|1.6198675934248319|
|  1.252835588582649| 2.1871504745259434|1.4991616638603555|1.6198675934248319|
|  1.252835588582649| 2.1871504745259434|1.8570395242447004|1.6198675934248319|
|  1.252835588582649| 2.1871504745259434

In [46]:
filtro= csv.filter((F.col("Age")>30) & (F.col("Height")>160))
filtro.show()

+---------+---+------+------------------+------------------+
|     Name|Age|Gender|            Height|            Weight|
+---------+---+------+------------------+------------------+
|Alejandro| 72|  Male|175.09077870318868| 67.71691582543606|
|  Gustavo| 73|  Male|184.72151535794515| 69.43436307585084|
|   Marcos| 66|  Male| 167.2833431342786| 67.40999627643211|
|    Jorge| 53|Female|180.71948252374528| 88.49667898234189|
| Gabriela| 35|  Male| 174.3897278047251| 67.40077470974775|
|   Daniel| 79|  Male|174.40544987517276| 99.25533855454273|
|  Agustín| 44|  Male|174.16319667544008| 50.84032522188359|
|    Carla| 32|Female|177.59816205547563| 92.86561566992015|
|   Javier| 50|  Male|163.89042723619391| 88.67093760286542|
|    Sofía| 61|Female| 174.6015160854719| 95.18766519617841|
|    María| 45|Female|165.70166405037423| 77.08757014836466|
|  Silvana| 33|Female| 182.5501148524333| 80.61662116453643|
|   Martín| 69|  Male|192.26218561242598| 53.80046986691393|
|  Ricardo| 48|  Male|19

In [56]:
agrupacion = join_interno.groupBy("Gender", "Country").max("Height")
agrupacion.show()

+------+---------+------------------+
|Gender|  Country|       max(Height)|
+------+---------+------------------+
|Female|Argentina|199.97373733033967|
|  Male|Argentina|199.94247373884636|
|Female| Paraguay|199.94368721348093|
|Female|  Uruguay| 199.9347406339932|
|  Male|  Uruguay|199.96931584934472|
|  Male| Paraguay|199.99126144423732|
|  Male| Colombia|199.94651048827217|
|Female| Colombia|199.97026068844255|
|Female|    Chile| 199.8994357685822|
|  Male|    Chile|199.98621141614166|
|  Male|   Brasil|199.78317351205254|
|  Male|Venezuela| 199.9936599611383|
|Female|   Brasil|  199.966760770168|
|Female|Venezuela|199.95427219625924|
+------+---------+------------------+



In [None]:
agrupacion2 = join_interno.groupBy("Gender", "Country").agg(max("Height").alias("Max_Height"))
agrupacion2.show()

In [57]:
from pyspark.sql.window import Window

In [63]:
funcion_ventana = Window.partitionBy("Gender").orderBy("Height")
type(funcion_ventana)

pyspark.sql.window.WindowSpec

In [65]:
nuevo_df = csv.withColumn("rank", F.rank().over(funcion_ventana))
nuevo_df.show()

+---------+---+------+------------------+------------------+----+
|     Name|Age|Gender|            Height|            Weight|rank|
+---------+---+------+------------------+------------------+----+
|  Claudia| 42|Female|150.00077092085112| 61.92889886532357|   1|
|    Diego| 25|Female|150.00301128593455| 95.77813608017019|   2|
|  Nicolás| 60|Female| 150.0034002976035|  64.1839472591719|   3|
|  Marcelo| 42|Female|150.00391953450065| 89.44600933616275|   4|
|   Javier| 55|Female|150.00422875507198| 81.83341305049606|   5|
|      Sol| 29|Female|150.00468459371848|  73.3212921594511|   6|
|Elizabeth| 42|Female|150.00532857404403| 87.28073545848929|   7|
|    Carla| 46|Female| 150.0063531358863| 50.93663296546475|   8|
|    Diego| 38|Female|150.00704188107665| 62.28606411444926|   9|
|     Juan| 64|Female| 150.0084596149665| 86.89195469373625|  10|
|     Luis| 68|Female| 150.0097285334305| 71.75061254630637|  11|
|  Roberto| 29|Female|150.01016379338427| 86.33457104953513|  12|
|    Pedro

In [66]:
csv.write.parquet("./data/new_parquet", mode="overwrite")