<a href="https://colab.research.google.com/github/saitzaw/apache-spark-colab/blob/main/Spark_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar -xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark

In [None]:
import os
import findspark
from pyspark.sql import SparkSession

In [None]:
# Tell Spark where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
# Locate the Spark installation referenced by SPARK_HOME.
findspark.init()

# Start (or reuse) a local session that uses all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# With eager evaluation on, a bare DataFrame expression at the end of a cell
# is rendered as a table instead of its repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset below can be read from /content/gdrive.
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Semicolon-delimited cars dataset stored on the mounted Google Drive.
csv_file_path = "/content/gdrive/MyDrive/ColabDataset/cars.csv"

# No schema is supplied here, so every column is loaded as a string.
df = spark.read.csv(path=csv_file_path, sep=";", header=True)

In [None]:
# Print the first five rows without shortening long cell values.
df.show(n=5, truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504. |12.0        |70   |US    |
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693. |11.5        |70   |US    |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436. |11.0        |70   |US    |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433. |12.0        |70   |US    |
|Ford Torino              |17.0|8        |302.0       |140.0     |3449. |10.5        |70   |US    |
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



In [None]:
# Bare DataFrame expression: displayed as a table because eager evaluation is enabled.
df.limit(num=5)

Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
Chevrolet Chevell...,18.0,8,307.0,130.0,3504.0,12.0,70,US
Buick Skylark 320,15.0,8,350.0,165.0,3693.0,11.5,70,US
Plymouth Satellite,18.0,8,318.0,150.0,3436.0,11.0,70,US
AMC Rebel SST,16.0,8,304.0,150.0,3433.0,12.0,70,US
Ford Torino,17.0,8,302.0,140.0,3449.0,10.5,70,US


In [None]:
# Inspect the column types of the CSV as loaded without an explicit schema.
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: string (nullable = true)
 |-- Cylinders: string (nullable = true)
 |-- Displacement: string (nullable = true)
 |-- Horsepower: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- Acceleration: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Origin: string (nullable = true)



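### Note
- When using Spark, we need to take care of the data types ourselves: without an explicit schema, every CSV column is read as a string.
- Unlike in Pandas, each column in a PySpark schema is explicitly declared as nullable or not.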

### Datatype conversion in PySpark
- Note: string-typed columns cannot be used in mathematical operations such as addition, subtraction, multiplication, and so on.

In [None]:
from pyspark.sql.types import (
    StringType,
    DoubleType,
    IntegerType,
    StructType,
    StructField
)

In [None]:
# (column name, Spark data type) pairs, in the same order as the CSV header.
to_change_cols_dtypes = [
    ("Car", StringType()),
    ("MPG", DoubleType()),
    ("Cylinders", IntegerType()),
    ("Displacement", DoubleType()),
    ("Horsepower", DoubleType()),
    ("Weight", DoubleType()),
    ("Acceleration", DoubleType()),
    ("Model", IntegerType()),
    ("Origin", StringType()),
]

In [None]:
# Turn the (name, type) pairs into an explicit schema; every field is nullable.
schema = StructType(
    [
        StructField(col_name, col_type, nullable=True)
        for col_name, col_type in to_change_cols_dtypes
    ]
)

In [None]:
# Re-read the same file, this time applying the explicit schema so numeric
# columns are loaded as double/integer instead of string.
df = spark.read.csv(path=csv_file_path, schema=schema, sep=";", header=True)

In [None]:
# Confirm that the explicit schema was applied to the re-loaded DataFrame.
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [None]:
# Print the first five rows of the typed DataFrame without truncating values.
df.show(n=5, truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504.0|12.0        |70   |US    |
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693.0|11.5        |70   |US    |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436.0|11.0        |70   |US    |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433.0|12.0        |70   |US    |
|Ford Torino              |17.0|8        |302.0       |140.0     |3449.0|10.5        |70   |US    |
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



### Count the number of rows and columns

In [None]:
# Total number of rows.
df.count()

406

In [None]:
# Number of rows after removing exact duplicates; equal to the total above
# when the dataset has no duplicated rows.
df.distinct().count()

406

In [25]:
# Number of columns (df.columns is the list of column names).
len(df.columns)

9

In [27]:
# Show the distinct values of the Car column (show() prints at most 20 rows
# and truncates long strings by default).
df.select('Car').distinct().show()

+--------------------+
|                 Car|
+--------------------+
|Volkswagen 1131 D...|
|Chevrolete Chevel...|
|Chevrolet Monte C...|
|     Ford LTD Landau|
|       Honda Prelude|
|      Chevrolet Nova|
|   Volkswagen Rabbit|
|     Ford Torino 500|
|        Toyota Camry|
|         Audi 100 LS|
|Plymouth Valiant ...|
|Toyota Corolla Ma...|
|Oldsmobile Cutlas...|
|Fiat 124 Sport Coupe|
|     Volvo 145e (sw)|
|Chevrolet Caprice...|
|            Audi Fox|
|    Chevrolet Camaro|
|       Dodge Aspen 6|
|    Pontiac Catalina|
+--------------------+
only showing top 20 rows



In [29]:
# collect() already returns a Python list of Row objects, so no extra list
# comprehension is needed. It pulls every distinct value to the driver, which
# is fine for a small column like this one.
df.select('Car').distinct().collect()

[Row(Car='Volkswagen 1131 Deluxe Sedan'),
 Row(Car='Chevrolete Chevelle Malibu'),
 Row(Car='Chevrolet Monte Carlo Landau'),
 Row(Car='Ford LTD Landau'),
 Row(Car='Honda Prelude'),
 Row(Car='Chevrolet Nova'),
 Row(Car='Volkswagen Rabbit'),
 Row(Car='Ford Torino 500'),
 Row(Car='Toyota Camry'),
 Row(Car='Audi 100 LS'),
 Row(Car='Plymouth Valiant Custom'),
 Row(Car='Toyota Corolla Mark II (sw)'),
 Row(Car='Oldsmobile Cutlass Supreme'),
 Row(Car='Fiat 124 Sport Coupe'),
 Row(Car='Volvo 145e (sw)'),
 Row(Car='Chevrolet Caprice Classic'),
 Row(Car='Audi Fox'),
 Row(Car='Chevrolet Camaro'),
 Row(Car='Dodge Aspen 6'),
 Row(Car='Pontiac Catalina'),
 Row(Car='AMC Ambassador Brougham'),
 Row(Car='Ford Maverick'),
 Row(Car='Chevrolet Vega'),
 Row(Car='Plymouth Fury III'),
 Row(Car='Datsun 200-SX'),
 Row(Car='Plymouth Volare Premier v8'),
 Row(Car='Plymouth Arrow GS'),
 Row(Car='Mazda RX2 Coupe'),
 Row(Car='Subaru DL'),
 Row(Car='Dodge Aspen SE'),
 Row(Car='Mazda GLC Custom'),
 Row(Car='Datsun 610'

### Filter
- Select the rows where a column matches a specified condition
- Select the rows where multiple columns match the required values

In [30]:
# Filter with a SQL-style condition string; keeps rows whose Horsepower exceeds 130.
df.filter("Horsepower > 130")

Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
Buick Skylark 320,15.0,8,350.0,165.0,3693.0,11.5,70,US
Plymouth Satellite,18.0,8,318.0,150.0,3436.0,11.0,70,US
AMC Rebel SST,16.0,8,304.0,150.0,3433.0,12.0,70,US
Ford Torino,17.0,8,302.0,140.0,3449.0,10.5,70,US
Ford Galaxie 500,15.0,8,429.0,198.0,4341.0,10.0,70,US
Chevrolet Impala,14.0,8,454.0,220.0,4354.0,9.0,70,US
Plymouth Fury iii,14.0,8,440.0,215.0,4312.0,8.5,70,US
Pontiac Catalina,14.0,8,455.0,225.0,4425.0,10.0,70,US
AMC Ambassador DPL,15.0,8,390.0,190.0,3850.0,8.5,70,US
Chevrolet Chevell...,0.0,8,350.0,165.0,4142.0,11.5,70,US


### Tweak
- When a condition involves a string-typed column, use Pandas-style column expressions (e.g. `df['Origin'] == 'Europe'`).

In [33]:
# Combine two column conditions. Each comparison needs its own parentheses
# because `&` binds more tightly than `>` and `==`.
df.filter(
    (df.Horsepower > 130) & (df.Origin == 'Europe')
)

Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
Peugeot 604sl,16.2,6,163.0,133.0,3410.0,15.8,78,Europe
