# Data Processing using PySpark

In [None]:
# Google Colab setup for Spark and PySpark: mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#instalar java y spark
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
!tar xf spark-3.5.5-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os

# Export the JDK and Spark install locations that are set up for this notebook.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.5-bin-hadoop3",
    }
)

In [None]:
# Initialise findspark so that `pyspark` can be imported in this kernel
# (called without arguments; presumably it picks up SPARK_HOME set earlier — confirm).
import findspark
findspark.init()

In [None]:
!mkdir -p /content/jars
!wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P /content/jars
!wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1034/aws-java-sdk-bundle-1.11.1034.jar -P /content/jars

In [None]:
import os

from pyspark.sql import SparkSession

# Comma-separated list of the S3A connector jars to put on the Spark classpath.
jars = "/content/jars/hadoop-aws-3.3.4.jar,/content/jars/aws-java-sdk-bundle-1.11.1034.jar"

# Build (or reuse) a local SparkSession configured for S3A access.
# All Hadoop options carry the "spark.hadoop." prefix, which is how Spark forwards
# settings to the Hadoop configuration; the credential keys previously lacked it,
# unlike the other fs.s3a.* options in this cell.
# Credentials are read from the standard AWS environment variables so that real
# secrets never have to be pasted into the notebook; the original placeholder
# strings remain as fallbacks when the variables are not set.
spark = (
    SparkSession.builder
    .appName("S3Connection")
    .master("local[*]")
    .config("spark.jars", jars)
    .config(
        "spark.hadoop.fs.s3a.access.key",
        os.environ.get("AWS_ACCESS_KEY_ID", "AWS_ACCESS_KEY"),
    )
    .config(
        "spark.hadoop.fs.s3a.secret.key",
        os.environ.get("AWS_SECRET_ACCESS_KEY", "AWS_SECRET_KEY"),
    )
    .config(
        "spark.hadoop.fs.s3a.session.token",
        os.environ.get("AWS_SESSION_TOKEN", "AWS_SESSION_TOKEN"),
    )
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Load the CSV dataset from S3.
# The Spark session only configures the S3A connector (fs.s3a.*), so the path must
# use the s3a:// scheme; a plain s3:// URI has no filesystem registered on stock Hadoop.
# header=True takes column names from the first line; inferSchema=True lets Spark
# infer column types from the data.
# NOTE(review): 'bucke_name' looks like a placeholder — replace with the real bucket.
df = spark.read.csv(
    's3a://bucke_name/datasets/sample_data.csv',
    inferSchema=True,
    header=True,
)

In [None]:
# Column names of the DataFrame
df.columns

In [None]:
# Number of columns in the DataFrame
len(df.columns)

In [None]:
# Number of records (rows) in the DataFrame.
# The cell previously evaluated df.columns, which lists column names, not the row count.
df.count()

In [None]:
# Shape of the dataset as a (row count, column count) tuple
dataset_shape = (df.count(), len(df.columns))
print(dataset_shape)

In [None]:
# Print the DataFrame schema
df.printSchema()

In [None]:
# Preview the first five rows of the DataFrame
df.show(n=5)