In [1]:
#import findspark module , which is helper tool to locate and initialize the pyspark
import findspark

In [2]:
#call the init method of findspace to set up the environement variables for pyspark
findspark.init()

In [3]:
#import spark session and row classes from the pyspark.sql module which are used to create and manipulate dataframes
from pyspark.sql import SparkSession, Row


In [4]:
#import all functions from pyspark.sql.functions module, which are used to perform various operations on dataframes
from pyspark.sql.functions import *

In [5]:
#import datetime classes from the datatime module , used for represent date and time.

from datetime import datetime, date
import pandas as pd
#import all types from pyspark.sql.types module , used to define the schema of our dataframe
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

In [6]:
#creating dataframe 

df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])

In [7]:
#printing the schema 
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [8]:
#printing the dataframe 
df.show(1) # showing frst row in table format
df.show()  # showing entire dataframe 
df.first() # showing frst row in line .


+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 1 row

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))

In [9]:
#Create a PySpark DataFrame from a pandas DataFrame

pandas_df=pd.DataFrame({'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)]})
spark.createDataFrame(pandas_df).show()
spark.createDataFrame(pandas_df).printSchema()

+---+---+-------+----------+
|  a|  b|      c|         d|
+---+---+-------+----------+
|  1|2.0|string1|2000-01-01|
|  2|3.0|string2|2000-02-01|
|  3|4.0|string3|2000-03-01|
+---+---+-------+----------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)



In [10]:
df.count()
df.dtypes


[('a', 'bigint'),
 ('b', 'double'),
 ('c', 'string'),
 ('d', 'date'),
 ('e', 'timestamp')]

In [11]:
#Alternatively, you can enable spark.sql.repl.eagerEval.enabled configuration for the eager evaluation of PySpark DataFrame in notebooks such as Jupyte
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
4,5.0,string3,2000-03-01,2000-01-03 12:00:00


In [12]:
df.columns 

['a', 'b', 'c', 'd', 'e']

In [13]:
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=4, b=5.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [14]:
df.select(df.c).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [15]:
df.select(df.a).show()

+---+
|  a|
+---+
|  1|
|  2|
|  4|
+---+



In [16]:
#reading data 
df=spark.read.csv('amazon.csv', header=True)

In [17]:
df

product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
B07JW9H4J1,Wayona Nylon Brai...,Computers&Accesso...,₹399,"₹1,099",64%,4.2,24269,High Compatibilit...,AG3D6O4STAQKAY2UV...,"Manav,Adarsh gupt...","R3HXWT0LRP0NMF,R2...","Satisfied,Chargin...",Looks durable Cha...,https://m.media-a...,https://www.amazo...
B098NS6PVG,Ambrane Unbreakab...,Computers&Accesso...,₹199,₹349,43%,4.0,43994,Compatible with a...,AECPFYFQVRUWC3KGN...,"ArdKn,Nirbhay kum...","RGIQEG07R9HS2,R1S...",A Good Braided Ca...,I ordered this ca...,https://m.media-a...,https://www.amazo...
B096MSW6CT,Sounce Fast Phone...,Computers&Accesso...,₹199,"₹1,899",90%,3.9,7928,【 Fast Charger& D...,AGU3BBQ2V2DDAMOAK...,"Kunal,Himanshu,vi...","R3J3EQQ9TZI5ZJ,R3...",Good speed for ea...,Not quite durable...,https://m.media-a...,https://www.amazo...
B08HDJ86NZ,boAt Deuce USB 30...,Computers&Accesso...,₹329,₹699,53%,4.2,94363,The boAt Deuce US...,AEWAZDZZJLQUYVOVG...,"Omkar dhale,JD,HE...","R3EEUZKKK9J36I,R3...","Good product,Good...","Good product,long...",https://m.media-a...,https://www.amazo...
B08CF3B7N1,Portronics Konnec...,Computers&Accesso...,₹154,₹399,61%,4.2,16905,[CHARGE & SYNC FU...,AE3Q6KSUK5P75D5HF...,"rahuls6099,Swasat...","R1BP4L2HH9TFUP,R1...",As good as origin...,Bought this inste...,https://m.media-a...,https://www.amazo...
B08Y1TFSP6,pTron Solero TB30...,Computers&Accesso...,₹149,"₹1,000",85%,3.9,24871,Fast Charging & D...,AEQ2YMXSZWEOHK2EH...,"Jayesh,Rajesh k.,...","R7S8ANNSDPR40,R3C...","It's pretty good,...",It's a good produ...,https://m.media-a...,https://www.amazo...
B08WRWPM22,boAt Micro USB 55...,Computers&Accesso...,₹176.63,₹499,65%,4.1,15188,It Ensures High S...,AG7C6DAADCTRQJG2B...,"Vivek kumar,Amazo...","R8E73K2KWJRDS,RSD...","Long durable.,goo...",Build quality is ...,https://m.media-a...,https://www.amazo...
B08DDRGWTJ,MI Usb Type-C Cab...,Computers&Accesso...,₹229,₹299,23%,4.3,30411,1m long Type-C US...,AHW6E5LQ2BDYOIVLA...,"Pavan A H,Jayesh ...","R2X090D1YHACKR,R3...",Worth for money -...,Worth for money -...,https://m.media-a...,https://www.amazo...
B008IFXQFU,TP-Link USB WiFi ...,Computers&Accesso...,₹499,₹999,50%,4.2,179691,USB WiFi Adapter ...,AGV3IEFANZCKECFGU...,"Azhar JuMan,Aniru...","R1LW6NWSVTVZ2H,R3...",Works on linux fo...,I use this to con...,https://m.media-a...,https://www.amazo...
B082LZGK39,Ambrane Unbreakab...,Computers&Accesso...,₹199,₹299,33%,4.0,43994,Universal Compati...,AECPFYFQVRUWC3KGN...,"ArdKn,Nirbhay kum...","RGIQEG07R9HS2,R1S...",A Good Braided Ca...,I ordered this ca...,https://m.media-a...,https://www.amazo...


In [18]:
#registering dataframe as a table.
df.createOrReplaceTempView("amazon")
#runnig spark sql  to print the number of rows.
spark.sql("SELECT count(*) from amazon").show()

+--------+
|count(1)|
+--------+
|    1465|
+--------+



In [19]:
type(df)

pyspark.sql.dataframe.DataFrame

In [20]:
spark.stop()

In [21]:
from pyspark.sql import SparkSession

In [22]:
sc = SparkSession.builder.getOrCreate()

In [23]:
df=pd.DataFrame({'a':[1,2,3,4,5]})
df=sc.createDataFrame(df)
df.show()
type(df)

+---+
|  a|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+



pyspark.sql.dataframe.DataFrame

In [24]:
sc.stop()

In [25]:
# Initialize SparkSession and sparksession
from pyspark.sql import SparkSession
from pyspark.sql import *

spark = SparkSession.builder.getOrCreate()

# Create an RDD using SparkSession
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# Perform an action to collect the RDD
type(rdd)

pyspark.rdd.RDD

In [26]:
rdd.map(lambda a:2*a ).collect()
rdd.map(lambda a:2*a ).take(2)

[2, 4]

In [27]:
type(rdd)
rdd.take(2)

[1, 2]

In [28]:
df

DataFrame[a: bigint]

In [29]:
df=pd.DataFrame({'a':[1,2,3,4,5,6,7,8]})
df=spark.createDataFrame(df)
df.show()


+---+
|  a|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+



In [30]:
df1 = spark.createDataFrame([(1,), (2,), (3,)], ["number"])
df_map = df1.withColumn("number_x2", df1["number"] * 2)
df_map.show()

+------+---------+
|number|number_x2|
+------+---------+
|     1|        2|
|     2|        4|
|     3|        6|
+------+---------+



In [31]:
sc.stop()
