# Data Processing using PySpark in Google Colab with AWS

In [None]:
# Google Colab setup for Spark / PySpark with AWS.
# Mount Google Drive at /content/gdrive.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
#instalar java y spark
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
# Point JAVA_HOME and SPARK_HOME at the JDK and Spark locations used by this
# notebook.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop3",
})

In [None]:
# Create the SparkSession (`spark`) and SparkContext (`sc`) used by the rest
# of the notebook, configured to read from Amazon S3 through the s3a connector.
import os

import findspark
findspark.init()
from pyspark.sql import SparkSession

# AWS credentials are read from environment variables instead of being typed
# into the notebook, so they are never saved or shared with it. Set them
# before running this cell, for example:
#     import getpass
#     os.environ["AWS_ACCESS_KEY_ID"] = getpass.getpass("AWS access key id: ")
# AWS_SESSION_TOKEN is only required for temporary credentials.
s3a_credentials = {
    'fs.s3a.access.key': os.environ.get('AWS_ACCESS_KEY_ID'),
    'fs.s3a.secret.key': os.environ.get('AWS_SECRET_ACCESS_KEY'),
    'fs.s3a.session.token': os.environ.get('AWS_SESSION_TOKEN'),
}

builder = (
    SparkSession.builder
    .appName("data_processing")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")  # per Spark docs, 0 means unlimited
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.amazonaws:aws-java-sdk:1.11.1034,org.apache.hadoop:hadoop-aws:3.3.4")
    # Hadoop options carry the "spark.hadoop." prefix: that is the documented
    # way to have Spark copy them into the Hadoop configuration, which makes
    # them visible to both the DataFrame reader and the SparkContext (RDD) API.
    .config('spark.hadoop.fs.s3a.path.style.access', 'true')
    .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    .config('spark.hadoop.fs.s3a.endpoint', 's3.amazonaws.com')
)

# Only pass the credentials that are actually defined; an unset value is
# skipped rather than sent to S3 as a bogus key.
for hadoop_key, credential in s3a_credentials.items():
    if credential:
        builder = builder.config('spark.hadoop.' + hadoop_key, credential)

spark = builder.getOrCreate()

sc = spark.sparkContext

In [None]:
# Echo the SparkSession so the notebook renders its representation.
spark

In [None]:
# Echo the SparkContext so the notebook renders its representation.
sc

In [None]:
# Load the CSV dataset into `df`, using the first row as the header and
# letting Spark infer the column types.
#
# Active source: the S3 bucket.
COVID_CSV_PATH = 's3a://notebooksochoac/datasets/covid19/Casos_positivos_de_COVID-19_en_Colombia-100K.csv'
df = spark.read.csv(
    COVID_CSV_PATH,
    header=True,
    inferSchema=True,
)
#
# Alternative: the sample data that ships with Google Colab:
#
#df=spark.read.csv('sample_data/california_housing_test.csv',inferSchema=True,header=True)
#
# Alternative: your "trabajo3" folder in Google Drive:
#
# df=spark.read.csv('gdrive/MyDrive/st0263-2266/bigdata/datasets/Casos_positivos_de_COVID-19_en_Colombia.csv',inferSchema=True,header=True)
#




In [None]:
# Columns: list the DataFrame's column names.
df.columns

In [None]:
# Datatypes: print the DataFrame's schema.
df.printSchema()

In [None]:
# Select only 5 columns and show 5 rows.
# The last column name is written with a real "ó"; the previous literal held
# the mis-decoded character sequence instead, which is a different string.
# NOTE(review): assumes the CSV header is 'Fecha de diagnóstico' -- confirm
# against the output of df.columns.
df.select('Nombre municipio', 'Edad', 'Sexo', 'Estado', 'Fecha de diagnóstico').show(5)

In [None]:
# Change a column name: project two columns, then rename
# 'Nombre Departamento' to 'Dep' and show 5 rows.
department_and_age = df.select('Nombre Departamento', 'Edad')
department_and_age.withColumnRenamed('Nombre Departamento', 'Dep').show(5)

In [None]:
# Drop columns: project four columns, then drop two of them and show 5 rows.
# The notification-date column is written with a real "ó" (the previous
# literals held the mis-decoded character sequence) and with the same casing
# in select() and drop().
# NOTE(review): assumes the CSV header is 'Fecha de notificación' -- confirm
# against the output of df.columns.
case_columns = df.select('fecha reporte web', 'ID de caso', 'Fecha de notificación', 'Nombre departamento')
case_columns.drop('Nombre departamento', 'Fecha de notificación').show(5)

In [None]:
# Filter the records: keep the rows whose department equals 'ANTIOQUIA'
# and show 5 of them.
is_antioquia = df['Nombre departamento'] == 'ANTIOQUIA'
df.filter(is_antioquia).show(5)

In [None]:
# UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Plain Python function.
# Departments labelled 'Tierra caliente'. Kept in a frozenset that is built
# once, instead of a list that was rebuilt on every call.
_WARM_DEPARTMENTS = frozenset([
    'SANTANDER', 'VAUPES', 'CASANARE', 'CUNDINAMARCA', 'GUAVIARE',
    'TOLIMA', 'CARTAGENA', 'CHOCO', 'SAN ANDRES', 'VICHADA',
])


def price_range(dep):
    """Label a department name as warm or cold climate.

    Args:
        dep: Department name, or None for a missing value.

    Returns:
        None when ``dep`` is None, so a missing department is not labelled
        as cold. 'Tierra caliente' when the name, ignoring letter case and
        surrounding whitespace, is one of the warm departments.
        'Tierra fria' for every other value.
    """
    if dep is None:
        return None
    if not isinstance(dep, str):
        # Non-string values never matched the list; keep that outcome.
        return 'Tierra fria'
    if dep.strip().upper() in _WARM_DEPARTMENTS:
        return 'Tierra caliente'
    return 'Tierra fria'

# Wrap the Python function as a UDF that returns strings.
brand_udf = udf(price_range, StringType())

# Apply the UDF to the department column, store the result in a new
# 'Tipo de clima' column, and show 10 rows without truncating the values.
department_names = df.select('Nombre departamento')
climate_label = brand_udf(df['Nombre departamento'])
department_names.withColumn('Tipo de clima', climate_label).show(10, truncate=False)