In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Environment variables naming the JDK directory and the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
# List the working directory to check that the Spark archive, its unpacked folder and the data file are present.
!ls

custs1.txt   spark-3.1.1-bin-hadoop3.2
sample_data  spark-3.1.1-bin-hadoop3.2.tgz


In [None]:
# findspark.init() has to run before pyspark is imported, so the order of these statements matters.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Local-mode session; "local[*]" asks for as many worker threads as there are cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Eager evaluation: property used to get better-formatted output tables in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Bare expression so the notebook displays the session summary.
spark

In [None]:
# Load the customer file as CSV: the first line supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv("/content/custs1.txt", header=True, inferSchema=True)


In [None]:
# Print the column names, types and nullability of the customer DataFrame.
df.printSchema()

root
 |-- cid: integer (nullable = true)
 |-- f_name: string (nullable = true)
 |-- l_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- desig: string (nullable = true)



In [None]:
from pyspark.sql.functions import col

In [None]:
# 1. First name, last name and age of customers older than 40.
(
    df.select("f_name", "l_name", "age")
    .filter("age> 40")
    .show()
)

+--------+----------+---+
|  f_name|    l_name|age|
+--------+----------+---+
|Kristina|     Chung| 55|
|   Paige|      Chen| 74|
|Gretchen|      Hill| 66|
|   Karen|   Puckett| 74|
| Patrick|      Song| 42|
|   Elsie|  Hamilton| 43|
|   Hazel|    Bender| 63|
| Dolores|McLaughlin| 60|
| Francis|  McNamara| 47|
|  Marion|      Moon| 41|
|    Beth|   Woodard| 65|
|   Julia|     Desai| 49|
|  Jerome|   Wallace| 52|
|    Neal|  Lawrence| 72|
|    Jean|   Griffin| 45|
|Kristine| Dougherty| 63|
| Crystal|    Powers| 67|
|    Eric|    Steele| 66|
|  Wesley|    Teague| 42|
|  Claire| Gallagher| 42|
+--------+----------+---+
only showing top 20 rows



In [None]:
# 2. Customers whose first name starts with an uppercase "S" (the match is case-sensitive).
(
    df.filter(col("f_name").startswith("S"))
    .show()
)

+-------+---------+-------+---+--------------------+
|    cid|   f_name| l_name|age|               desig|
+-------+---------+-------+---+--------------------+
|4000003|   Sherri| Melton| 34|         Firefighter|
|4000012|    Sandy| Raynor| 26|              Writer|
|4000030|Stephanie|Hawkins| 50|Human resources a...|
|4000035|  Shelley|  Weeks| 25|            Reporter|
|4000052|  Shirley|Merritt| 21|            Reporter|
|4000058|    Scott|  Hoyle| 40|              Doctor|
|4000077| Samantha| Hardin| 27|              Doctor|
|4000078|     Sara|  Lucas| 44|        Loan officer|
|4000079|    Stacy|  Eason| 31|            Musician|
|4000087|    Steve| Graves| 73|               Nurse|
|4000089|   Sherri| Sutton| 75|       Social worker|
|4000092|    Stacy|  Olsen| 25|        Veterinarian|
|4000101|    Scott| Golden| 27|             Teacher|
|4000108|    Shawn| Boykin| 34|        Photographer|
|4000120|     Sara|Perkins| 67|               Actor|
|4000133| Samantha|  Floyd| 72|    Childcare w

In [None]:
# 3. Use the LIKE operator to find customers whose last name starts with "F".
(
    df.filter(col("l_name").like("F%"))
    .show()
)

+-------+--------+--------+---+--------------------+
|    cid|  f_name|  l_name|age|               desig|
+-------+--------+--------+---+--------------------+
|4000133|Samantha|   Floyd| 72|    Childcare worker|
|4000135| Vincent| Fischer| 33|        Statistician|
|4000162|   Bruce| Farrell| 35|           Librarian|
|4000171| Stephen|   Finch| 30|               Coach|
|4000184|   Sarah|     Fox| 73|        Psychologist|
|4000265|   Stacy| Frazier| 69|            Designer|
|4000303|    Erin|   Finch| 54|      Police officer|
|4000362|   Holly|     Fox| 74|           Physicist|
|4000379|   Harry|  Foster| 50|               Coach|
|4000444|  Thomas|  Fuller| 51|    Childcare worker|
|4000455| Deborah|  French| 56|          Accountant|
|4000459|Gretchen| Francis| 60|          Politician|
|4000461|  Audrey| Forrest| 50|           Architect|
|4000467|   Derek|Freedman| 66|Recreation and fi...|
|4000514|  Nathan|    Ford| 36|   Financial analyst|
|4000546|  Calvin|Fletcher| 55|               

In [None]:
# All customers whose designation is one of Lawyer, Teacher or Pilot.
(
    df.filter(col("desig").isin(["Lawyer", "Teacher", "Pilot"]))
    .show()
)

+-------+---------+---------+---+-------+
|    cid|   f_name|   l_name|age|  desig|
+-------+---------+---------+---+-------+
|4000001| Kristina|    Chung| 55|  Pilot|
|4000002|    Paige|     Chen| 74|Teacher|
|4000005|    Karen|  Puckett| 74| Lawyer|
|4000007|    Elsie| Hamilton| 43|  Pilot|
|4000026|   Marian|  Solomon| 27| Lawyer|
|4000033|      Tim|    Watts| 58| Lawyer|
|4000063|  Melinda|  Proctor| 27|Teacher|
|4000101|    Scott|   Golden| 27|Teacher|
|4000109|  Vincent|   Sumner| 31| Lawyer|
|4000130|     Toni|    Glass| 46| Lawyer|
|4000167|     Lynn|Robertson| 45| Lawyer|
|4000195|   Claire|  Pickett| 59| Lawyer|
|4000229|     Faye|   Norman| 64|  Pilot|
|4000230|    Kathy|    Burch| 28|  Pilot|
|4000251|   Jeremy|    House| 61|  Pilot|
|4000271|    Alice|    Nance| 59|  Pilot|
|4000322|Geraldine|   Jensen| 50|  Pilot|
|4000336|   Steven|  Ballard| 62|  Pilot|
|4000346|    Diana|    Crane| 26| Lawyer|
|4000352|   Ernest|  Stanton| 51| Lawyer|
+-------+---------+---------+---+-

In [None]:
# Pilots whose age is greater than 50.
# NOTE(review): this comment previously said "age > 40", but the filter below uses 50 — confirm the intended threshold.
# Each condition needs its own parentheses because & binds more tightly than > and ==.
df.where((col('age') > 50) & (col('desig')=="Pilot")).show()

+-------+----------+--------+---+-----+
|    cid|    f_name|  l_name|age|desig|
+-------+----------+--------+---+-----+
|4000001|  Kristina|   Chung| 55|Pilot|
|4000229|      Faye|  Norman| 64|Pilot|
|4000251|    Jeremy|   House| 61|Pilot|
|4000271|     Alice|   Nance| 59|Pilot|
|4000336|    Steven| Ballard| 62|Pilot|
|4000392|      Emma|   Olson| 74|Pilot|
|4000403|       Leo|Lassiter| 65|Pilot|
|4000423|      Alan|  O'Neal| 59|Pilot|
|4000562|    Jordan|   Wrenn| 61|Pilot|
|4000601|     Allan|  Nguyen| 51|Pilot|
|4000893|      Gene|    Love| 59|Pilot|
|4000912|     Maria|  Reilly| 64|Pilot|
|4000958|     Danny|  Bowers| 72|Pilot|
|4001039|   Dorothy|   Stone| 52|Pilot|
|4001092|Jacqueline|Friedman| 55|Pilot|
|4001250|  Nicholas| Leonard| 55|Pilot|
|4001424|     Jesse|   Eaton| 58|Pilot|
|4001428|   Deborah|   Britt| 64|Pilot|
|4001489|   Gregory|Saunders| 63|Pilot|
|4001494|     Jesse|  Peters| 56|Pilot|
+-------+----------+--------+---+-----+
only showing top 20 rows



In [None]:
# Customers aged 40 to 50; between() includes both bounds.
# truncate=False prints every cell in full instead of shortening long values.
(
    df.filter(col("age").between(40, 50))
    .show(truncate=False)
)

+-------+---------+---------+---+---------------------------+
|cid    |f_name   |l_name   |age|desig                      |
+-------+---------+---------+---+---------------------------+
|4000006|Patrick  |Song     |42 |Veterinarian               |
|4000007|Elsie    |Hamilton |43 |Pilot                      |
|4000011|Francis  |McNamara |47 |Therapist                  |
|4000013|Marion   |Moon     |41 |Carpenter                  |
|4000015|Julia    |Desai    |49 |Musician                   |
|4000018|Jean     |Griffin  |45 |Childcare worker           |
|4000023|Wesley   |Teague   |42 |Carpenter                  |
|4000025|Claire   |Gallagher|42 |Musician                   |
|4000028|Dwight   |Monroe   |45 |Economist                  |
|4000029|Wayne    |Connolly |40 |Real estate agent          |
|4000030|Stephanie|Hawkins  |50 |Human resources assistant  |
|4000042|Katherine|Bender   |44 |Physicist                  |
|4000045|Lois     |Joseph   |44 |Musician                   |
|4000047

In [None]:
# Customers that have no designation recorded.
(
    df.filter(col("desig").isNull())
    .show()
)

+-------+-------+---------+---+-----+
|    cid| f_name|   l_name|age|desig|
+-------+-------+---------+---+-----+
|4000014|   Beth|  Woodard| 65| null|
|4000046|  Louis|Rosenthal| 31| null|
|4000257| Monica|   Dodson| 58| null|
|4000327|Heather|   Dawson| 43| null|
|4000411| Joanna|   Hoover| 50| null|
|4000463|  Wayne|   Weiner| 70| null|
|4000494|Stephen|   Waller| 75| null|
|4000679|Herbert| Jernigan| 59| null|
|4000695|  Chris| Anderson| 52| null|
|4000890| Hannah|    Casey| 30| null|
|4000944|   Mary|  Boyette| 59| null|
|4001066|    Ron|   Brandt| 65| null|
|4001359|  Craig| McNamara| 31| null|
|4001400|Matthew|  Beasley| 39| null|
|4001478| Sheryl|     Lamb| 53| null|
|4001505|Raymond|    Roach| 30| null|
|4001550|  Barry|    Sykes| 34| null|
|4001561| Arlene|     Gray| 51| null|
|4001675| Wesley|   Graves| 33| null|
|4001952|  Renee|      Day| 46| null|
+-------+-------+---------+---+-----+
only showing top 20 rows



In [None]:
# Number of customers that have no designation recorded.
df.filter(col("desig").isNull()).count()

83

In [None]:
# First pass over the transactions file: header row only, with no schema
# and no type inference requested.
txns_df = spark.read.csv("/content/txns_with_header.csv", header=True)

In [None]:
# Inspect the raw values as read from the file; truncate=False prints every cell in full.
txns_df.show(truncate=False)

+--------+----------+-------+-------+----------------------+---------------------------------+--------------+--------------+------+
|txn_id  |txn_dt    |cid    |amt    |prod_cat              |prod                             |city          |state         |mode  |
+--------+----------+-------+-------+----------------------+---------------------------------+--------------+--------------+------+
|00000000|06-26-2011|4007024|04a0.33|Exercise & Fitness    |Cardio Machine Accessories       |Clarksville   |Tennessee     |credit|
|00000001|05-26-2011|4006742|198.44 |Exercise & Fitness    |Weightlifting Gloves             |Long Beach    |California    |credit|
|00000002|06-01-2011|4009775|005.58 |Exercise & Fitness    |Weightlifting Machine Accessories|Anaheim       |California    |credit|
|00000003|06-05-2011|4002199|198.19 |Gymnastics            |Gymnastics Rings                 |Milwaukee     |Wisconsin     |credit|
|00000004|12-17-2011|4002613|098.81 |Team Sports           |Field Hockey    

In [100]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DecimalType,DateType,TimestampType

In [None]:
# Explicit schema for the transactions file: one (column name, Spark type)
# pair per CSV column, in file order.
txns_schema = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in [
            ("txn_id", IntegerType()),
            ("txn_dt", DateType()),
            ("cid", IntegerType()),
            ("amt", DecimalType(10, 3)),
            ("prod_cat", StringType()),
            ("prod", StringType()),
            ("city", StringType()),
            ("state", StringType()),
            ("mode", StringType()),
        ]
    ]
)

In [None]:
# Re-read the transactions with the explicit schema; txn_dt values are
# parsed using the MM-dd-yyyy date pattern.
txns_df = spark.read.csv(
    "/content/txns_with_header.csv",
    schema=txns_schema,
    header=True,
    dateFormat="MM-dd-yyyy",
)

In [None]:
# Print the column names, types and nullability of the transactions DataFrame.
txns_df.printSchema()

root
 |-- txn_id: integer (nullable = true)
 |-- txn_dt: date (nullable = true)
 |-- cid: integer (nullable = true)
 |-- amt: decimal(10,3) (nullable = true)
 |-- prod_cat: string (nullable = true)
 |-- prod: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- mode: string (nullable = true)



In [None]:
# Show up to 100 rows, printing every cell in full (truncate=False).
txns_df.show(100,truncate=False)

+------+----------+-------+-------+----------------------+---------------------------------+----------------+--------------+------+
|txn_id|txn_dt    |cid    |amt    |prod_cat              |prod                             |city            |state         |mode  |
+------+----------+-------+-------+----------------------+---------------------------------+----------------+--------------+------+
|0     |2011-06-26|4007024|null   |Exercise & Fitness    |Cardio Machine Accessories       |Clarksville     |Tennessee     |credit|
|1     |2011-05-26|4006742|198.440|Exercise & Fitness    |Weightlifting Gloves             |Long Beach      |California    |credit|
|2     |2011-06-01|4009775|5.580  |Exercise & Fitness    |Weightlifting Machine Accessories|Anaheim         |California    |credit|
|3     |2011-06-05|4002199|198.190|Gymnastics            |Gymnastics Rings                 |Milwaukee       |Wisconsin     |credit|
|4     |2011-12-17|4002613|98.810 |Team Sports           |Field Hockey      

In [101]:
# Schema for the timestamp sample file: a string NAME column and a timestamp txn_dt column.
dt_schema = (
    StructType()
    .add("NAME", StringType())
    .add("txn_dt", TimestampType())
)

In [109]:
# Read the timestamp sample file with the explicit schema; txn_dt values are
# parsed with the dd/MM/yyyy HH:mm pattern.
# The option key uses its documented spelling, timestampFormat (camelCase, like dateFormat).
date_df = (
    spark.read
    .option("header", True)
    .schema(dt_schema)
    .option("timestampFormat", "dd/MM/yyyy HH:mm")
    .csv("/content/date_format_3.txt")
)

In [110]:
# Display the parsed rows to check that txn_dt was read as a timestamp.
date_df.show()

+-----+-------------------+
| NAME|             txn_dt|
+-----+-------------------+
|Ankit|2023-03-18 13:10:00|
+-----+-------------------+

